From 0f09bc503ee262761b50bb6027a912fe7e1a2c8f Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 10 Jan 2024 15:03:12 -0700
Subject: [PATCH 01/44] Adds ManuscriptPromptConfig, which performs
 filename->prompt resolution via YAML config files.

---
 libs/manubot_ai_editor/prompt_config.py | 144 ++++++++++++++++++++++++
 libs/manubot_ai_editor/utils.py         |  26 +++++
 2 files changed, 170 insertions(+)
 create mode 100644 libs/manubot_ai_editor/prompt_config.py
diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
new file mode 100644
index 0000000..56a9119
--- /dev/null
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -0,0 +1,144 @@
+"""
+Implements reading filename->prompt resolution rules from a YAML file, via the
+class ManuscriptPromptConfig.
+"""
+
+import os
+import re
+from typing import Optional
+import yaml
+from pathlib import Path
+
+from manubot_ai_editor.utils import get_obj_path
+
+class ManuscriptConfigException(Exception):
+    """
+    Parent class for exceptions raised by ManuscriptPromptConfig's loading process.
+    """
+    pass
+
+class ManuscriptPromptConfig:
+    """
+    Loads configuration from two YAML files in 'content_dir':
+    -  ai_revision-prompts.yaml, which contains custom prompt definitions and/or
+    mappings of prompts to files
+    - ai_revision-config.yaml, containing general
+    configuration for the AI revision process
+
+    After loading, the main use of this class is to resolve a prompt for a given
+    filename. This is done by calling config.get_prompt_for_filename(<filename>),
+    which uses both the 'ai_revision-prompts.yaml' and 'ai_revision-config.yaml'
+    files to determine the prompt for a given filename.
+    """
+    def __init__(self, content_dir: str, title: str, keywords: str) -> None:
+        self.content_dir = Path(content_dir)
+        self.config = self._load_config()
+        self.prompts, self.prompts_files = self._load_custom_prompts()
+
+        # storing these so they can be interpolated into prompts
+        self.title = title
+        self.keywords = keywords
+
+    def _load_config(self) -> dict:
+        """
+        Loads general configuration from ai_revision-config.yaml
+        """
+        
+        config_file_path = os.path.join(self.content_dir, "ai_revision-config.yaml")
+
+        try:
+            with open(config_file_path, "r") as f:
+                return yaml.safe_load(f)
+        except FileNotFoundError:
+            return None
+
+        
+    def _load_custom_prompts(self) -> (dict, dict):
+        """
+        Loads custom prompts from ai_revision-prompts.yaml. The file
+        must contain either 'prompts' or 'prompts_files' as top-level keys.
+
+        'prompts' is a dictionary where keys are prompt names and values are
+        prompts. For example: '{"intro": "proofread the following paragraph"}'.
+        The key can be used in the configuration file to specify a prompt for
+        a given file.
+
+        """
+
+        prompt_file_path = os.path.join(self.content_dir, "ai_revision-prompts.yaml")
+
+        try:
+            with open(prompt_file_path, "r") as f:
+                data = yaml.safe_load(f)
+        except FileNotFoundError:
+            # if the file doesn't exist, return None for both prompts and prompts_files
+            return (None, None)
+
+        # validate the existence of at least one of the keys we require
+        if 'prompts' not in data and 'prompts_files' not in data:
+            raise ManuscriptConfigException('The "ai_revision-prompts.yaml" YAML file must contain a "prompts" or a "prompts_files" key.')
+
+        # if the top-level key was 'prompts', that implies we need the `ai_revision-config.yaml`
+        # file to match those prompts to filenames, so raise an exception if it doesn't exist
+        if 'prompts' in data and not self.config:
+            raise ManuscriptConfigException(
+                'The "ai_revision-config.yaml" YAML file must exist if "ai_revision-prompts.yaml" begins with the "prompts" key.'
+            )
+
+        prompts = data.get('prompts', {})
+        prompts_files = data.get('prompts_files', {})
+
+        return (prompts, prompts_files)
+
+    def get_prompt_for_filename(self, filename: str, use_default: bool = True) -> (Optional[str], Optional[re.Match]):
+        """
+        Retrieves the prompt for a given filename. It checks the following sources
+        for a match in order:
+        - the 'ignore' list in ai_revision-config.yaml; if matched, returns None.
+        - the 'matchings' list in ai_revision-config.yaml; if matched, returns
+        the value for the referenced prompt, specified in ai_revision-prompts.yaml,
+        - the 'prompts_files' collection in ai_revision-prompts.yaml; if matched,
+        returns the prompt specified alongside the file pattern from that file.
+
+        If a match is found, returns a tuple of the prompt text and the match object.
+        If the file is in the ignore list, returns (None, m), where m is the match
+        object that matched the ignore pattern.
+        If nothing matched and 'use_default' is True, returns (default_prompt,
+        None) where 'default_prompt' is the default prompt specified in
+        ai_revision-config.yaml, if available.
+        """
+
+        # first, check the ignore list to see if we should bail early
+        for ignore in get_obj_path(self.config, ('files', 'ignore'), missing=[]):
+            if (m := re.search(ignore, filename)):
+                return (None, m)
+
+        # FIXME: which takes priority, the files collection in ai_revision-config.yaml
+        #  or the prompt_file? we went with config taking precedence for now
+
+        # then, consult ai_revision-config.yaml's 'matchings' collection; if a
+        # match is found, use the named prompt from ai_revision-prompts.yaml
+        for entry in get_obj_path(self.config, ('files', 'matchings'), missing=[]):
+            # iterate through all the 'matchings' entries, trying to find one
+            # that matches the current filename
+            for pattern in entry['files']:
+                if (m := re.search(pattern, filename)):
+                    # since we matched, use the 'prompts' collection to return a
+                    # named prompt corresponding to the one from the 'matchings'
+                    # collection
+                    return (
+                        self.prompts.get(entry['prompt'], None) if self.prompts else None, m
+                    )
+
+        # since we haven't found a match yet, consult ai_revision-prompts.yaml's
+        # 'prompts_files' collection
+        if self.prompts_files:
+            for pattern, prompt in self.prompts_files.items():
+                if (m := re.search(pattern, filename)):
+                    return (prompt, m)
+
+        # finally, return the default prompt
+        return (
+            get_obj_path(self.config, ('files', 'default_prompt')) if use_default else None,
+            None
+        )
diff --git a/libs/manubot_ai_editor/utils.py b/libs/manubot_ai_editor/utils.py
index cd6cfab..1873109 100644
--- a/libs/manubot_ai_editor/utils.py
+++ b/libs/manubot_ai_editor/utils.py
@@ -24,3 +24,29 @@ def starts_with_similar(string: str, prefix: str, threshold: float = 0.8) -> boo
     return (
         difflib.SequenceMatcher(None, prefix, string[: len(prefix)]).ratio() > threshold
     )
+
+def get_obj_path(target: any, path: tuple, missing=None):
+    """
+    Traverse a nested object using a tuple of keys, returning the last resolved
+    value in the path. If any key is not found, return 'missing' (default None).
+
+    >>> get_obj_path({'a': {'b': {'c': 1}}}, ('a', 'b', 'c'))
+    1
+    >>> get_obj_path({'a': {'b': {'c': 1}}}, ('a', 'b', 'd')) is None
+    True
+    >>> get_obj_path({'a': {'b': {'c': 1}}}, ('a', 'b', 'd'), missing=2)
+    2
+    >>> get_obj_path({'a': [100, {'c': 1}]}, ('a', 1, 'c'))
+    1
+    >>> get_obj_path({'a': [100, {'c': 1}]}, ('a', 1, 'd')) is None
+    True
+    >>> get_obj_path({'a': [100, {'c': 1}]}, ('a', 3)) is None
+    True
+    """
+    try:
+        for key in path:
+            target = target[key]
+    except (KeyError, IndexError, TypeError):
+        return missing
+        
+    return target

From f2f96a70c390b556063d406ae08361f71e850b37 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 10 Jan 2024 15:03:56 -0700
Subject: [PATCH 02/44] Integrates prompt->filename resolution into
 ManuscriptEditor.revise_manuscript(), passed down to model-specific prompt
 resolution via the 'resolved_prompt' argument.

---
 libs/manubot_ai_editor/editor.py | 25 ++++++++++++++++++++++---
 libs/manubot_ai_editor/models.py | 31 ++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/libs/manubot_ai_editor/editor.py b/libs/manubot_ai_editor/editor.py
index e8a337d..b5f6627 100644
--- a/libs/manubot_ai_editor/editor.py
+++ b/libs/manubot_ai_editor/editor.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 
 from manubot_ai_editor import env_vars
+from manubot_ai_editor.prompt_config import ManuscriptPromptConfig
 from manubot_ai_editor.models import ManuscriptRevisionModel
 from manubot_ai_editor.utils import (
     get_yaml_field,
@@ -27,6 +28,12 @@ def __init__(self, content_dir: str | Path):
         self.title = get_yaml_field(metadata_file, "title")
         self.keywords = get_yaml_field(metadata_file, "keywords")
 
+        self.prompt_config = ManuscriptPromptConfig(
+            content_dir=content_dir,
+            title=self.title,
+            keywords=self.keywords
+        )
+
     @staticmethod
     def prepare_paragraph(paragraph: list[str]) -> str:
         """
@@ -81,6 +88,7 @@ def revise_and_write_paragraph(
         paragraph: list[str],
         revision_model: ManuscriptRevisionModel,
         section_name: str = None,
+        resolved_prompt: str = None,
         outfile=None,
     ) -> None | tuple[str, str]:
         """
@@ -89,6 +97,7 @@ def revise_and_write_paragraph(
         Arguments:
             paragraph: list of lines of the paragraph.
             section_name: name of the section the paragraph belongs to.
+            resolved_prompt: a prompt resolved via the ai_revision prompt config; None if unavailable
             revision_model: model to use for revision.
             outfile: file object to write the revised paragraph to.
 
@@ -116,6 +125,7 @@ def revise_and_write_paragraph(
             paragraph_revised = revision_model.revise_paragraph(
                 paragraph_text,
                 section_name,
+                resolved_prompt=resolved_prompt
             )
 
             if paragraph_revised.strip() == "":
@@ -248,6 +258,7 @@ def revise_file(
         output_dir: Path | str,
         revision_model: ManuscriptRevisionModel,
         section_name: str = None,
+        resolved_prompt: str = None
     ):
         """
         It revises an entire Markdown file and writes the revised file to the output directory.
@@ -258,6 +269,7 @@ def revise_file(
             output_dir (Path | str): path to the directory where the revised file will be written.
             revision_model (ManuscriptRevisionModel): model to use for revision.
             section_name (str, optional): Defaults to None. If so, it will be inferred from the filename.
+            resolved_prompt (str, optional): A prompt resolved via ai_revision prompt config files, which overrides any custom or section-derived prompts; None if unavailable.
         """
         input_filepath = self.content_dir / input_filename
         assert input_filepath.exists(), f"Input file {input_filepath} does not exist"
@@ -376,7 +388,7 @@ def revise_file(
 
                     # revise and write paragraph to output file
                     self.revise_and_write_paragraph(
-                        paragraph, revision_model, section_name, outfile
+                        paragraph, revision_model, section_name, resolved_prompt=resolved_prompt, outfile=outfile
                     )
 
                     # clear the paragraph list
@@ -418,7 +430,7 @@ def revise_file(
             # output file
             if paragraph:
                 self.revise_and_write_paragraph(
-                    paragraph, revision_model, section_name, outfile
+                    paragraph, revision_model, section_name, resolved_prompt=None, outfile=outfile
                 )
 
     def revise_manuscript(
@@ -452,7 +464,13 @@ def revise_manuscript(
 
             filename_section = self.get_section_from_filename(filename.name)
 
-            # we do not process the file if it has no section and there is no custom prompt
+            # use the ai_revision prompt config to attempt to resolve a prompt
+            resolved_prompt, _ = self.prompt_config.get_prompt_for_filename(filename.name)
+
+            # we do not process the file if all hold:
+            # 1. it has no section
+            # 2. we're unable to resolve it via ai_revision prompt configuration
+            # 3. there is no custom prompt
             if filename_section is None and (
                 env_vars.CUSTOM_PROMPT not in os.environ
                 or os.environ[env_vars.CUSTOM_PROMPT].strip() == ""
@@ -472,4 +490,5 @@ def revise_manuscript(
                 output_dir,
                 revision_model,
                 section_name=filename_section,
+                resolved_prompt=resolved_prompt
             )
diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index ade7c63..3965caa 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -18,13 +18,14 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def revise_paragraph(self, paragraph_text, section_name):
+    def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         """
         It revises a paragraph of a manuscript from a given section.
 
         Args:
             paragraph_text (str): text of the paragraph to revise.
             section_name (str): name of the section the paragraph belongs to.
+            resolved_prompt (str): prompt resolved via ai_revision config files, if available
 
         Returns:
             Revised paragraph text.
@@ -51,7 +52,7 @@ def __init__(self, add_paragraph_marks=False):
         self.sentence_end_pattern = re.compile(r".\n")
         self.add_paragraph_marks = add_paragraph_marks
 
-    def revise_paragraph(self, paragraph_text, section_name):
+    def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         if self.add_paragraph_marks:
             return (
                 "%%% PARAGRAPH START %%%\n"
@@ -74,7 +75,7 @@ def __init__(self, revised_header: str = "Revised:"):
         super().__init__()
         self.revised_header = revised_header
 
-    def revise_paragraph(self, paragraph_text, section_name):
+    def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         revised_paragraph = super().revise_paragraph(paragraph_text, section_name)
         return f"{self.revised_header}{revised_paragraph}"
 
@@ -89,7 +90,7 @@ def __init__(self):
         super().__init__()
         self.sentence_end_pattern = re.compile(r"\n")
 
-    def revise_paragraph(self, paragraph_text: str, section_name: str) -> str:
+    def revise_paragraph(self, paragraph_text: str, section_name: str, resolved_prompt=None) -> str:
         """
         It takes each sentence of the paragraph and randomizes the words.
         """
@@ -248,7 +249,7 @@ def __init__(
         self.several_spaces_pattern = re.compile(r"\s+")
 
     def get_prompt(
-        self, paragraph_text: str, section_name: str = None
+        self, paragraph_text: str, section_name: str = None, resolved_prompt: str = None
     ) -> str | tuple[str, str]:
         """
         Returns the prompt to be used for the revision of a paragraph that
@@ -259,6 +260,7 @@ def get_prompt(
         Args:
             paragraph_text: text of the paragraph to revise.
             section_name: name of the section the paragraph belongs to.
+            resolved_prompt: prompt resolved via ai_revision config, if available
 
         Returns:
             If self.endpoint != "edits", then returns a string with the prompt to be used by the model for the revision of the paragraph.
@@ -270,6 +272,22 @@ def get_prompt(
              2) the paragraph to revise.
         """
 
+        # prompts are resolved in the following order, with the first satisfied
+        # condition taking effect:
+
+        # 1. if a custom prompt is specified via the env var specified by
+        #    env_vars.CUSTOM_PROMPT, then the text in that env var is used as
+        #    the prompt.
+        # 2. if the files ai_revision-config.yaml and/or ai_revision-prompts.yaml
+        #    are available, then a prompt resolved from the filename via those
+        #    config files is used. (this is initially resolved in
+        #    ManuscriptEditor.revise_manuscript() and passed down to here via
+        #    the 'resolved_prompt' argument.)
+        # 3. if a section_name is specified, then a canned section-specific
+        #    prompt matching the section name is used.
+        # 4. finally, if none of the above are true, then a generic prompt is
+        #    used.
+
         custom_prompt = None
         if ((c := os.environ.get(env_vars.CUSTOM_PROMPT, "").strip()) and c != ""):
             custom_prompt = c
@@ -287,6 +305,9 @@ def get_prompt(
             # FIXME: if {paragraph_text} is in the prompt, this won't work for the edits endpoint
             #  a simple workaround is to remove {paragraph_text} from the prompt
             prompt = custom_prompt.format(**placeholders)
+        elif resolved_prompt:
+            # use the resolved prompt from the ai_revision config files, if available
+            prompt = resolved_prompt
         elif section_name in ("abstract",):
             prompt = f"""
                 Revise the following paragraph from the {section_name} of an academic paper (with the title '{self.title}' and keywords '{", ".join(self.keywords)}')

From f85aa87f66ec9e434e95286214f13c14d6ef0bd1 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 10 Jan 2024 15:05:18 -0700
Subject: [PATCH 03/44] Adds a few tests for ManuscriptPromptConfig

---
 .../phenoplier_full/00.front-matter.md        |   77 +
 .../phenoplier_full/01.abstract.md            |   12 +
 .../phenoplier_full/02.introduction.md        |   40 +
 .../phenoplier_full/04.00.results.md          |   10 +
 .../04.05.00.results_framework.md             |   66 +
 .../phenoplier_full/04.05.01.crispr.md        |   42 +
 .../04.15.drug_disease_prediction.md          |   94 +
 .../04.20.00.traits_clustering.md             |   95 +
 .../phenoplier_full/05.discussion.md          |   77 +
 .../phenoplier_full/07.00.methods.md          |  408 +
 .../phenoplier_full/10.references.md          |    4 +
 .../phenoplier_full/15.acknowledgements.md    |   48 +
 .../50.00.supplementary_material.md           |  743 ++
 .../phenoplier_full/ai_revision-config.yaml   |   25 +
 .../phenoplier_full/ai_revision-prompts.yaml  |   15 +
 .../phenoplier_full/manual-references.json    | 9296 +++++++++++++++++
 .../manuscripts/phenoplier_full/metadata.yaml |  134 +
 tests/test_prompt_config.py                   |  104 +
 18 files changed, 11290 insertions(+)
 create mode 100644 tests/manuscripts/phenoplier_full/00.front-matter.md
 create mode 100644 tests/manuscripts/phenoplier_full/01.abstract.md
 create mode 100644 tests/manuscripts/phenoplier_full/02.introduction.md
 create mode 100644 tests/manuscripts/phenoplier_full/04.00.results.md
 create mode 100644 tests/manuscripts/phenoplier_full/04.05.00.results_framework.md
 create mode 100644 tests/manuscripts/phenoplier_full/04.05.01.crispr.md
 create mode 100644 tests/manuscripts/phenoplier_full/04.15.drug_disease_prediction.md
 create mode 100644 tests/manuscripts/phenoplier_full/04.20.00.traits_clustering.md
 create mode 100644 tests/manuscripts/phenoplier_full/05.discussion.md
 create mode 100644 tests/manuscripts/phenoplier_full/07.00.methods.md
 create mode 100644 tests/manuscripts/phenoplier_full/10.references.md
 create mode 100644 tests/manuscripts/phenoplier_full/15.acknowledgements.md
 create mode 100644 tests/manuscripts/phenoplier_full/50.00.supplementary_material.md
 create mode 100644 tests/manuscripts/phenoplier_full/ai_revision-config.yaml
 create mode 100644 tests/manuscripts/phenoplier_full/ai_revision-prompts.yaml
 create mode 100644 tests/manuscripts/phenoplier_full/manual-references.json
 create mode 100644 tests/manuscripts/phenoplier_full/metadata.yaml
 create mode 100644 tests/test_prompt_config.py

diff --git a/tests/manuscripts/phenoplier_full/00.front-matter.md b/tests/manuscripts/phenoplier_full/00.front-matter.md
new file mode 100644
index 0000000..c86f5cb
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/00.front-matter.md
@@ -0,0 +1,77 @@
+{##
+  This file contains a Jinja2 front-matter template that adds version and authorship information.
+  Changing the Jinja2 templates in this file may cause incompatibility with Manubot updates.
+  Pandoc automatically inserts title from metadata.yaml, so it is not included in this template.
+##}
+
+_A DOI-citable version of this manuscript is available at<br /><https://doi.org/10.1038/s41467-023-41057-4>_
+
+<!-- {## Template to insert build date and source ##}
+<small><em>
+This manuscript
+{% if manubot.ci_source is defined and manubot.ci_source.provider == "appveyor" -%}
+([permalink]({{manubot.ci_source.artifact_url}}))
+{% elif manubot.html_url_versioned is defined -%}
+([permalink]({{manubot.html_url_versioned}}))
+{% endif -%}
+was automatically generated
+{% if manubot.ci_source is defined -%}
+from [{{manubot.ci_source.repo_slug}}@{{manubot.ci_source.commit | truncate(length=7, end='', leeway=0)}}](https://github.com/{{manubot.ci_source.repo_slug}}/tree/{{manubot.ci_source.commit}})
+{% endif -%}
+on {{manubot.generated_date_long}}.
+</em></small> -->
+
+{% if manubot.date_long != manubot.generated_date_long -%}
+Published: {{manubot.date_long}}
+{% endif %}
+
+## Authors
+
+{## Template for listing authors ##}
+{% for author in manubot.authors %}
++ **{{author.name}}**
+  {% if author.corresponding is defined and author.corresponding == true -%}^[✉](#correspondence)^{%- endif -%}
+  <br>
+  {%- set has_ids = false %}
+  {%- if author.orcid is defined and author.orcid is not none %}
+    {%- set has_ids = true %}
+    ![ORCID icon](images/orcid.svg){.inline_icon width=16 height=16}
+    [{{author.orcid}}](https://orcid.org/{{author.orcid}})
+  {%- endif %}
+  {%- if author.github is defined and author.github is not none %}
+    {%- set has_ids = true %}
+    · ![GitHub icon](images/github.svg){.inline_icon width=16 height=16}
+    [{{author.github}}](https://github.com/{{author.github}})
+  {%- endif %}
+  {%- if author.twitter is defined and author.twitter is not none %}
+    {%- set has_ids = true %}
+    · ![Twitter icon](images/twitter.svg){.inline_icon width=16 height=16}
+    [{{author.twitter}}](https://twitter.com/{{author.twitter}})
+  {%- endif %}
+  {%- if author.mastodon is defined and author.mastodon is not none and author["mastodon-server"] is defined and author["mastodon-server"] is not none %}
+    {%- set has_ids = true %}
+    · ![Mastodon icon](images/mastodon.svg){.inline_icon width=16 height=16}
+    [\@{{author.mastodon}}@{{author["mastodon-server"]}}](https://{{author["mastodon-server"]}}/@{{author.mastodon}})
+  {%- endif %}
+  {%- if has_ids %}
+    <br>
+  {%- endif %}
+  <small>
+  {%- if author.affiliations is defined and author.affiliations|length %}
+     {{author.affiliations | join('; ')}}
+  {%- endif %}
+  {%- if author.funders is defined and author.funders|length %}
+     · Funded by {{author.funders | join('; ')}}
+  {%- endif %}
+  </small>
+{% endfor %}
+
+::: {#correspondence}
+✉ — Correspondence possible via {% if manubot.ci_source is defined -%}[GitHub Issues](https://github.com/{{manubot.ci_source.repo_slug}}/issues){% else %}GitHub Issues{% endif %}
+{% if manubot.authors|map(attribute='corresponding')|select|max -%}
+or email to
+{% for author in manubot.authors|selectattr("corresponding") -%}
+{{ author.name }} \<{{ author.email }}\>{{ ", " if not loop.last else "." }}
+{% endfor %}
+{% endif %}
+:::
diff --git a/tests/manuscripts/phenoplier_full/01.abstract.md b/tests/manuscripts/phenoplier_full/01.abstract.md
new file mode 100644
index 0000000..6976ff0
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/01.abstract.md
@@ -0,0 +1,12 @@
+## Abstract {.page_break_before}
+
+Genes act in concert with each other in specific contexts to perform their functions.
+Determining how these genes influence complex traits requires a mechanistic understanding of expression regulation across different conditions.
+It has been shown that this insight is critical for developing new therapies.
+Transcriptome-wide association studies have helped uncover the role of individual genes in disease-relevant mechanisms.
+However, modern models of the architecture of complex traits predict that gene-gene interactions play a crucial role in disease origin and progression.
+Here we introduce PhenoPLIER, a computational approach that maps gene-trait associations and pharmacological perturbation data into a common latent representation for a joint analysis.
+This representation is based on modules of genes with similar expression patterns across the same conditions.
+We observe that diseases are significantly associated with gene modules expressed in relevant cell types, and our approach is accurate in predicting known drug-disease pairs and inferring mechanisms of action.
+Furthermore, using a CRISPR screen to analyze lipid regulation, we find that functionally important players lack associations but are prioritized in trait-associated modules by PhenoPLIER.
+By incorporating groups of co-expressed genes, PhenoPLIER can contextualize genetic associations and reveal potential targets missed by single-gene strategies.
diff --git a/tests/manuscripts/phenoplier_full/02.introduction.md b/tests/manuscripts/phenoplier_full/02.introduction.md
new file mode 100644
index 0000000..aac3578
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/02.introduction.md
@@ -0,0 +1,40 @@
+## Introduction
+
+Genes work together in context-specific networks to carry out different functions [@pmid:19104045; @doi:10.1038/ng.3259].
+Variations in these genes can change their functional role and, at a higher level, affect disease-relevant biological processes [@doi:10.1038/s41467-018-06022-6].
+In this context, determining how genes influence complex traits requires mechanistically understanding expression regulation across different cell types [@doi:10.1126/science.aaz1776; @doi:10.1038/s41586-020-2559-3; @doi:10.1038/s41576-019-0200-9], which in turn should lead to improved treatments [@doi:10.1038/ng.3314; @doi:10.1371/journal.pgen.1008489].
+Previous studies have described different regulatory DNA elements [@doi:10.1038/nature11247; @doi:10.1038/nature14248; @doi:10.1038/nature12787; @doi:10.1038/s41586-020-03145-z; @doi:10.1038/s41586-020-2559-3] including genetic effects on gene expression across different tissues [@doi:10.1126/science.aaz1776].
+Integrating functional genomics data and GWAS data [@doi:10.1038/s41588-018-0081-4; @doi:10.1016/j.ajhg.2018.04.002; @doi:10.1038/s41588-018-0081-4; @doi:10.1038/ncomms6890] has improved the identification of these transcriptional mechanisms that, when dysregulated, commonly result in tissue- and cell lineage-specific pathology [@pmid:20624743; @pmid:14707169; @doi:10.1073/pnas.0810772105].
+
+
+Given the availability of gene expression data across several tissues [@doi:10.1038/nbt.3838; @doi:10.1038/s41467-018-03751-6; @doi:10.1126/science.aaz1776; @doi:10.1186/s13040-020-00216-9], an effective approach to identify these biological processes is the transcription-wide association study (TWAS), which integrates expression quantitative trait loci (eQTLs) data to provide a mechanistic interpretation for GWAS findings.
+TWAS relies on testing whether perturbations in gene regulatory mechanisms mediate the association between genetic variants and human diseases [@doi:10.1371/journal.pgen.1009482; @doi:10.1038/ng.3506; @doi:10.1371/journal.pgen.1007889; @doi:10.1038/ng.3367], and these approaches have been highly successful not only in understanding disease etiology at the transcriptome level [@pmid:33931583; @doi:10.1101/2021.10.21.21265225; @pmid:31036433] but also in disease-risk prediction (polygenic scores) [@doi:10.1186/s13059-021-02591-w] and drug repurposing [@doi:10.1038/nn.4618] tasks.
+However, TWAS works at the individual gene level, which does not capture more complex interactions at the network level.
+
+
+These gene-gene interactions play a crucial role in current theories of the architecture of complex traits, such as the omnigenic model [@doi:10.1016/j.cell.2017.05.038], which suggests that methods need to incorporate this complexity to disentangle disease-relevant mechanisms.
+Widespread gene pleiotropy, for instance, reveals the highly interconnected nature of transcriptional networks [@doi:10.1038/s41588-019-0481-0; @doi:10.1038/ng.3570], where potentially all genes expressed in disease-relevant cell types have a non-zero effect on the trait [@doi:10.1016/j.cell.2017.05.038; @doi:10.1016/j.cell.2019.04.014].
+One way to learn these gene-gene interactions is using the concept of gene module: a group of genes with similar expression profiles across different conditions [@pmid:22955619; @pmid:25344726; @doi:10.1038/ng.3259].
+In this context, several unsupervised approaches have been proposed to infer these gene-gene connections by extracting gene modules from co-expression patterns [@pmid:9843981; @pmid:24662387; @pmid:16333293].
+Matrix factorization techniques like independent or principal component analysis (ICA/PCA) have shown superior performance in this task [@doi:10.1038/s41467-018-03424-4] since they capture local expression effects from a subset of samples and can handle modules overlap effectively.
+Therefore, integrating genetic studies with gene modules extracted using unsupervised learning could further improve our understanding of disease origin [@pmid:25344726] and progression [@pmid:18631455].
+
+
+Here we propose PhenoPLIER, an omnigenic approach that provides a gene module perspective to genetic studies.
+The flexibility of our method allows integrating different data modalities into the same representation for a joint analysis.
+We show that this module perspective can infer how groups of functionally-related genes influence complex traits, detect shared and distinct transcriptomic properties among traits, and predict how pharmacological perturbations affect genes' activity to exert their effects.
+PhenoPLIER maps gene-trait associations and drug-induced transcriptional responses into a common latent representation.
+For this, we integrate thousands of gene-trait associations (using TWAS from PhenomeXcan [@doi:10.1126/sciadv.aba2083]) and transcriptional profiles of drugs (from LINCS L1000 [@doi:10.1016/j.cell.2017.10.049]) into a low-dimensional space learned from public gene expression data on tens of thousands of RNA-seq samples (recount2 [@doi:10.1016/j.cels.2019.04.003; @doi:10.1038/nbt.3838]).
+We use a latent representation defined by a matrix factorization approach [@doi:10.1038/s41592-019-0456-1; @doi:10.1016/j.cels.2019.04.003] that extracts gene modules with certain sparsity constraints and preferences for those that align with prior knowledge (pathways).
+When mapping gene-trait associations to this reduced expression space, we observe that diseases are significantly associated with gene modules expressed in relevant cell types: such as hypothyroidism with T cells, corneal endothelial cells with keratometry measurements, hematological assays on specific blood cell types, plasma lipids with adipose tissue, and neuropsychiatric disorders with different brain cell types.
+Moreover, since PhenoPLIER can use models derived from large and heterogeneous RNA-seq datasets, we can also identify modules associated with cell types under specific stimuli or disease states.
+We observe that significant module-trait associations in PhenomeXcan (our discovery cohort) replicated in the Electronic Medical Records and Genomics (eMERGE) network phase III [@doi:10.1038/gim.2013.72; @doi:10.1101/2021.10.21.21265225] (our replication cohort).
+Furthermore, we perform a CRISPR screen to analyze lipid regulation in HepG2 cells.
+We observe more robust trait associations with modules than with individual genes, even when single genes known to be involved in lipid metabolism did not reach genome-wide significance.
+Compared to a single-gene approach, our module-based method also better predicts FDA-approved drug-disease links by capturing tissue-specific pathophysiological mechanisms linked with the mechanism of action of drugs (e.g., niacin with cardiovascular traits via a known immune mechanism).
+This improved drug-disease prediction suggests that modules may provide a better means to examine drug-disease relationships than individual genes.
+Finally, exploring the phenotype-module space reveals stable trait clusters associated with relevant tissues, including a complex branch involving lipids with cardiovascular, autoimmune, and neuropsychiatric disorders.
+In summary, instead of considering single genes associated with different complex traits, PhenoPLIER incorporates groups of genes that act together to carry out different functions in specific cell types.
+This approach improves robustness in detecting and interpreting genetic associations, and here we show how it can prioritize alternative and potentially more promising candidate targets even when known single gene associations are not detected.
+The approach represents a conceptual shift in the interpretation of genetic studies.
+It has the potential to extract mechanistic insight from statistical associations to enhance the understanding of complex diseases and their therapeutic modalities.
diff --git a/tests/manuscripts/phenoplier_full/04.00.results.md b/tests/manuscripts/phenoplier_full/04.00.results.md
new file mode 100644
index 0000000..60da89d
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/04.00.results.md
@@ -0,0 +1,10 @@
+## Results
+
+<!--
+
+Some papers that might be interesting:
+
+https://www.nature.com/articles/s41591-020-01221-5
+Air pollution linked to neurodegeneration markers
+
+-->
diff --git a/tests/manuscripts/phenoplier_full/04.05.00.results_framework.md b/tests/manuscripts/phenoplier_full/04.05.00.results_framework.md
new file mode 100644
index 0000000..4cef3ef
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/04.05.00.results_framework.md
@@ -0,0 +1,66 @@
+### PhenoPLIER: an integration framework based on gene co-expression patterns
+
+![
+**Schematic of the PhenoPLIER framework.**
+**a)** High-level schematic of PhenoPLIER (a gene module-based method) in the context of TWAS (single-gene) and GWAS (single-variant).
+In GWAS, we identify variants associated with traits.
+In TWAS, first, we identify variants that are associated with gene expression levels (eQTLs); then, prediction models based on eQTLs are used to impute gene expression, which is used to compute gene-trait associations.
+Resources such as LINCS L1000 provide information about how a drug perturbs gene expression; at the bottom-right corner, we show how a drug downregulates two genes (A and C).
+In PhenoPLIER, these data types are integrated using groups of genes co-expressed across one or more conditions (such as cell types) that we call gene modules or latent variables/LVs. Created with BioRender.com.
+**b)** The integration process in PhenoPLIER uses low-dimensional representations (matrices $\mathbf{Z}$ and $\mathbf{B}$) learned from large gene expression datasets (top).
+We used gene-drug information $\mathbf{L}$ from LINCS L1000 and gene-trait associations $\mathbf{M}$ from TWAS: PhenomeXcan was used as the discovery cohort, and eMERGE as replication (middle).
+PhenoPLIER provides three computational components (bottom):
+1) an LV-based regression model that associates an LV $j$ ($\mathbf{Z}_j$) with a trait $i$ ($\mathbf{M}_i$),
+2) a clustering framework that learns groups of traits from TWAS associations projected into the LV space ($\hat{\mathbf{M}}$),
+and 3) an LV-based drug repurposing approach that uses the projection of TWAS ($\hat{\mathbf{M}}$) and LINCS L1000 ($\hat{\mathbf{L}}$) into the LV space.
+**c)** Genes that are part of LV603, termed a neutrophil signature [@doi:10.1016/j.cels.2019.04.003], were expressed in relevant cell types (top), with 53 independent samples expressed in Neutrophils, 59 in Granulocytes, 20 in Whole blood, 56 in PBMC, 8 in mDCs, 29 in Monocytes, and 5 in Epithelial cells (the boxplot shows the 25th, 50th and 75th percentiles while the whiskers extend to the minimum/maximum values).
+LV603 was associated in PhenoPLIER with neutrophil counts and other white blood cells (bottom, showing the top 10 traits for LV603 after projecting gene-trait associations in PhenomeXcan).
+eQTLs: expression quantitative trait loci;
+MVN: multivariate normal distribution;
+PBMC: peripheral blood mononuclear cells;
+mDCs: myeloid dendritic cells.
+](images/entire_process/entire_process.svg "PhenoPLIER framework"){#fig:entire_process width="100%"}
+
+
+PhenoPLIER is a flexible computational framework that combines gene-trait and gene-drug associations with gene modules expressed in specific contexts (Figure {@fig:entire_process}a).
+The approach uses a latent representation (with latent variables or LVs representing gene modules) derived from a large gene expression compendium (Figure {@fig:entire_process}b, top) to integrate TWAS with drug-induced transcriptional responses (Figure {@fig:entire_process}b, middle) for a joint analysis.
+The approach consists of three main components (Figure {@fig:entire_process}b, bottom, see [Methods](#sec:methods)):
+1) an LV-based regression model to compute an association between an LV and a trait,
+2) a clustering framework to learn groups of traits with shared transcriptomic properties,
+and 3) an LV-based drug repurposing approach that links diseases to potential treatments.
+We performed extensive simulations for our regression model ([Supplementary Note 1](#sm:reg:null_sim)) and clustering framework ([Supplementary Note 2](#sm:clustering:null_sim)) to ensure proper calibration and expected results under a model of no association.
+
+
+We used TWAS results from PhenomeXcan [@doi:10.1126/sciadv.aba2083] and the eMERGE network [@doi:10.1101/2021.10.21.21265225] as discovery and replication cohorts, respectively ([Methods](#sec:methods:twas)).
+PhenomeXcan provides gene-trait associations for 4,091 different diseases and traits from the UK Biobank [@doi:10.1038/s41586-018-0579-z] and other studies, whereas the analyses on eMERGE were performed across 309 phecodes.
+TWAS results were derived using two statistical methods (see [Methods](#sec:methods:predixcan)):
+1) Summary-MultiXcan (S-MultiXcan) associations were used for the regression and clustering components,
+and 2) Summary-PrediXcan (S-PrediXcan) associations were used for the drug repurposing component.
+In addition, we also used colocalization results, which provide a probability of overlap between the GWAS and eQTL signals.
+For the drug-repurposing approach, we used transcriptional responses to small molecule perturbations from LINCS L1000 [@doi:10.1016/j.cell.2017.10.049] comprising 1,170 compounds.
+
+
+The latent gene expression representation was obtained from the MultiPLIER models [@doi:10.1016/j.cels.2019.04.003], which were derived by applying a matrix factorization method (the pathway-level information extractor or PLIER [@doi:10.1038/s41592-019-0456-1]) to recount2 [@doi:10.1038/nbt.3838] -- a uniformly-curated collection of transcript-level gene expression quantified by RNA-seq in a large, diverse set of samples collected across a range of disease states, cell types, differentiation stages, and various stimuli (see [Methods](#sec:methods:multiplier)).
+The MultiPLIER models extracted 987 LVs by optimizing not only data reconstruction but also the alignment of LVs with prior knowledge/pathways.
+
+
+Each LV or gene module represents a group of weighted genes expressed together in the same tissues and cell types as a functional unit.
+Since LVs might represent a functional set of genes regulated by the same transcriptional program [@doi:10.1186/1471-2164-7-187; @doi:10.1186/s13059-019-1835-8], we conjecture that the projection of TWAS and pharmacologic perturbations data into this latent space could provide a better mechanistic understanding.
+For this projection of different data modalities into the same space, PhenoPLIER converts gene associations to an LV score: all genes' standardized effect sizes for a trait (from TWAS) or differential expression values for a drug (from pharmacologic perturbation data) are multiplied by the LV genes' weights and summed, producing a single value.
+Instead of looking at individual genes, this process links different traits and drugs to functionally-related groups of genes or LVs.
+PhenoPLIER uses LV annotations about the specific conditions where the group of genes is expressed, such as cell types and tissues, even at specific developmental stages, disease stages or under distinct stimuli.
+Although this is not strictly necessary for PhenoPLIER to work, these annotations can dramatically improve the interpretability of results.
+MultiPLIER's models provide this information by linking LVs to samples, which may be annotated for experimental conditions (represented by matrix $\mathbf{B}$ at the top of Figure {@fig:entire_process}b) in which genes in an LV are expressed.
+An example of this is shown in Figure {@fig:entire_process}c.
+In the original MultiPLIER study, the authors reported that one of the latent variables, identified as LV603, was associated with a known neutrophil pathway and highly correlated with neutrophil count estimates from whole blood RNA-seq profiles [@doi:10.1186/s13059-016-1070-5].
+We analyzed LV603 using PhenoPLIER and found that
+1) neutrophil counts and other white blood cell traits were ranked among the top 10 traits out of 4,091 (Figure {@fig:entire_process}c, bottom), and basophils count and percentage were significantly associated with this LV when using our regression method (Table @tbl:sup:phenomexcan_assocs:lv603),
+and 2) LV603's genes were expressed in highly relevant cell types (Figure {@fig:entire_process}c, top).
+These initial results suggested that groups of functionally related and co-expressed genes tend to correspond to groups of trait-associated genes, and the approach can link transcriptional mechanisms from large and diverse dataset collections to complex traits.
+
+
+Therefore, PhenoPLIER allows the user to address specific questions, namely:
+do disease-associated genes belong to modules expressed in specific tissues and cell types?
+Are these cell type-specific modules associated with _different_ diseases, thus potentially representing a "network pleiotropy" example from an omnigenic point of view [@doi:10.1016/j.cell.2017.05.038]?
+Is there a subset of a module's genes that is closer to the definition of "core" genes (i.e., directly affecting the trait with no mediated regulation of other genes [@doi:10.1016/j.cell.2019.04.014]) and thus represents alternative and potentially better candidate targets?
+Are drugs perturbing these transcriptional mechanisms, and can they suggest potential mechanisms of action?
diff --git a/tests/manuscripts/phenoplier_full/04.05.01.crispr.md b/tests/manuscripts/phenoplier_full/04.05.01.crispr.md
new file mode 100644
index 0000000..df169f4
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/04.05.01.crispr.md
@@ -0,0 +1,42 @@
+### LVs link genes that alter lipid accumulation with relevant traits and tissues
+
+Our first experiment attempted to answer whether genes in a disease-relevant LV could represent potential therapeutic targets.
+For this, the first step was to obtain a set of genes strongly associated with a phenotype of interest.
+Therefore, we performed a fluorescence-based CRISPR-Cas9 screen in the HepG2 cell line and identified 462 genes associated with lipid regulation ([Methods](#sec:methods:crispr)).
+From these, we selected two high-confidence gene sets that either caused a decrease or increase of lipids:
+a lipids-decreasing gene-set with eight genes: *BLCAP*, *FBXW7*, *INSIG2*, *PCYT2*, *PTEN*, *SOX9*, *TCF7L2*, *UBE2J2*;
+and a lipids-increasing gene-set with six genes: *ACACA*, *DGAT2*, *HILPDA*, *MBTPS1*, *SCAP*, *SRPR* (Supplementary Data 2).
+
+
+![
+**Tissues and traits associated with a gene module related to lipid metabolism (LV246).**
+<!--  -->
+**a)** Top cell types/tissues in which LV246's genes are expressed.
+Values in the $y$-axis come from matrix $\mathbf{B}$ in the MultiPLIER models (Figure {@fig:entire_process}b, see Methods).
+In the $x$-axis, cell types/tissues are sorted by the maximum sample value.
+<!--  -->
+**b)** Gene-trait associations (unadjusted $p$-values from S-MultiXcan [@doi:10.1371/journal.pgen.1007889]; threshold at -log($p$)=10) and colocalization probability (fastENLOC) for the top traits in LV246.
+The top 40 genes in LV246 are shown, sorted by their LV weight (matrix $\mathbf{Z}$), from largest (the top gene *SCD*) to smallest (*FAR2*);
+*DGAT2* and *ACACA*, in boldface, are two of the six high-confidence genes in the lipids-increasing gene set from the CRISPR screen.
+Cardiovascular-related traits are in boldface.
+<!--  -->
+SGBS: Simpson Golabi Behmel Syndrome;
+CH2DB: CH<sub>2</sub> groups to double bonds ratio;
+HDL: high-density lipoprotein;
+RCP: locus regional colocalization probability.
+<!--  -->
+](images/lvs_analysis/lv246/lv246.svg "LV246 TWAS plot"){#fig:lv246 width="100%"}
+
+
+Next, we analyzed all 987 LVs using Fast Gene Set Enrichment Analysis (FGSEA) [@doi:10.1101/060012], and found 15 LVs nominally enriched (unadjusted *P* < 0.01) with these lipid-altering gene-sets (Tables @tbl:sup:lipids_crispr:modules_enriched_increase and @tbl:sup:lipids_crispr:modules_enriched_decrease).
+Among those with reliable sample metadata, LV246, the top LV associated with the lipids-increasing gene-set, contained genes mainly co-expressed in adipose tissue (Figure {@fig:lv246}a), which plays a key role in coordinating and regulating lipid metabolism.
+Using our regression framework across all traits in PhenomeXcan, we found that gene weights for this LV were predictive of gene associations for plasma lipids, high cholesterol, and Alzheimer's disease (Table @tbl:sup:phenomexcan_assocs:lv246, FDR < 1e-23).
+These lipids-related associations also replicated across the 309 traits in eMERGE (Table @tbl:sup:emerge_assocs:lv246), where LV246 was significantly associated with hypercholesterolemia (phecode: 272.11, FDR < 4e-9), hyperlipidemia (phecode: 272.1, FDR < 4e-7) and disorders of lipoid metabolism (phecode: 272, FDR < 4e-7).
+
+
+Two high-confidence genes from our CRISPR screening, *DGAT2* and *ACACA*, are responsible for encoding enzymes for triglycerides and fatty acid synthesis and were among the highest-weighted genes of LV246 (Figure {@fig:lv246}b, in boldface).
+However, in contrast to other members of LV246, *DGAT2* and *ACACA* were neither associated nor colocalized with any of the cardiovascular-related traits and thus would not have been prioritized by TWAS alone;
+instead, other members of LV246, such as *SCD*, *LPL*, *FADS2*, *HMGCR*, and *LDLR*, were significantly associated and colocalized with lipid-related traits.
+This lack of association of two high-confidence genes from our CRISPR screen might be explained from an omnigenic point of view [@doi:10.1016/j.cell.2019.04.014].
+Assuming that the TWAS models for *DGAT2* and *ACACA* capture all common *cis*-eQTLs (the only genetic component of gene expression that TWAS can capture) and there are no rare *cis*-eQTLs, these two genes might represent "core" genes (i.e., they directly affect the trait with no mediated regulation of other genes), and many of the rest in the LV are "peripheral" genes that *trans*-regulate them.
+
diff --git a/tests/manuscripts/phenoplier_full/04.15.drug_disease_prediction.md b/tests/manuscripts/phenoplier_full/04.15.drug_disease_prediction.md
new file mode 100644
index 0000000..7018ca0
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/04.15.drug_disease_prediction.md
@@ -0,0 +1,94 @@
+### LVs predict drug-disease pairs better than single genes
+
+We next determined how substituting LVs for individual genes predicted known treatment-disease relationships.
+For this, we used the transcriptional responses to small molecule perturbations profiled in LINCS L1000 [@doi:10.1016/j.cell.2017.10.049], which were further processed and mapped to DrugBank IDs [@doi:10.1093/nar/gkt1068; @doi:10.7554/eLife.26726; @doi:10.5281/zenodo.47223].
+Based on an established drug repurposing strategy that matches reversed transcriptome patterns between genes and drug-induced perturbations [@doi:10.1126/scitranslmed.3002648; @doi:10.1126/scitranslmed.3001318], we adopted a previously described framework that uses imputed transcriptomes from TWAS to prioritize drug candidates [@doi:10.1038/nn.4618].
+For this, we computed a drug-disease score by calculating the negative dot product between the $z$-scores for a disease (from TWAS) and the $z$-scores for a drug (from LINCS) across sets of genes of different sizes (see [Methods](#sec:methods:drug)).
+Therefore, a large score for a drug-disease pair indicated that higher (lower) predicted expression values of disease-associated genes are down (up)-regulated by the drug, thus predicting a potential treatment.
+Similarly, for the LV-based approach, we estimated how pharmacological perturbations affected the gene module activity by projecting expression profiles of drugs into our latent representation (Figure {@fig:entire_process}b).
+We used a manually-curated gold standard set of drug-disease medical indications [@doi:10.7554/eLife.26726; @doi:10.5281/zenodo.47664] for 322 drugs across 53 diseases to evaluate the prediction performance.
+
+
+![
+**Drug-disease prediction performance for gene-based and LV-based approaches.**
+The receiver operating characteristic (ROC) (left) and the precision-recall curves (right) for a gene-based and LV-based approach.
+"Random" refers to the average precision of a hundred classifiers with randomly permuted scores, where the error band represents the 95% confidence interval.
+AUC: area under the curve; AP: average precision.
+](images/drug_disease_prediction/roc_pr_curves.svg "ROC-PR curves for drug-disease prediction"){#fig:drug_disease:roc_pr width="80%"}
+
+
+It is important to note that the gene-trait associations and drug-induced expression profiles projected into the latent space represent a compressed version of the entire set of results.
+Despite this information loss, the LV-based method outperformed the gene-based one with an area under the curve of 0.632 and an average precision of 0.858 (Figure @fig:drug_disease:roc_pr).
+The prediction results suggested that this low-dimensional space captures biologically meaningful patterns that can link pathophysiological processes with the mechanism of action of drugs.
+
+
+We examined a specific drug-disease pair to determine whether the LVs driving the prediction were biologically plausible.
+Nicotinic acid (niacin) is a B vitamin widely used clinically to treat lipid disorders, although there is controversy on its clinical utility in preventing cardiovascular disease [@pmid:22085343; @pmid:25014686; @pmid:30977858].
+Niacin exerts its effects on multiple tissues, although its mechanisms are not well understood [@doi:10.1016/j.amjcard.2008.02.029; @doi:10.1194/jlr.S092007; @pmid:24363242; @pmid:24713591].
+This compound can increase high-density lipoprotein (HDL) by inhibiting an HDL catabolism receptor in the liver.
+Niacin also inhibits diacylglycerol acyltransferase–2 (DGAT2), which decreases the production of low-density lipoproteins (LDL) either by modulating triglyceride synthesis in hepatocytes or by inhibiting adipocyte triglyceride lipolysis [@doi:10.1016/j.amjcard.2008.02.029].
+Niacin was one of the drugs in the gold standard set indicated for atherosclerosis (AT) and coronary artery disease (CAD).
+We observed that this compound was predicted by both the gene-based and LV-based approaches as a medical indication for coronary artery disease (CAD), with scores above the mean (0.51 and 0.96, respectively).
+For AT, the LV-based approach predicted niacin as a therapeutic drug with a score of 0.52, whereas the gene-based method assigned a negative score of -0.01 (below the mean).
+Since LVs represent interpretable features associated with specific cell types, we analyzed which LVs positively contributed to these predictions (i.e., with an opposite direction between niacin and the disease).
+Notably, LV246 (Figure @fig:lv246), expressed in adipose tissue and liver and associated with plasma lipids and high cholesterol (Table @tbl:sup:phenomexcan_assocs:lv246), was the 16th most important module in the prediction of niacin as a therapeutic drug for AT.
+Besides the gold standard set, LV246 was among the top modules for other cardiovascular diseases, such as ischaemic heart disease (wide definition, 15th module) and high cholesterol (7th module).
+
+![
+**Top cell types/tissues where LV116's genes are expressed.**
+Values in the $y$-axis come from matrix $\mathbf{B}$ in the MultiPLIER models (Figure {@fig:entire_process}b).
+In the $x$-axis, cell types/tissues are sorted by the maximum sample value.
+The figure shows a clear immune response with cell types under different stimuli.
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP045500 -->
+MS: multiple sclerosis;
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP045569 -->
+<!-- PBMCs: peripheral blood mononuclear cells; -->
+HSV: treated with herpes simplex virus;
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP015670 -->
+WNV: infected with West Nile virus;
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP062958 -->
+IFNa: treated with interferon-alpha;
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP039361 -->
+HMDM: human peripheral blood mononuclear cell-derived macrophages;
+<!-- IPSDM: human induced pluripotent stem cell-derived macrophages; -->
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP056733 -->
+Salm: infected with *Salmonella typhimurium*;
+Yers: infected with *Yersinia pseudotuberculosis*;
+<!-- https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP062966 -->
+ISM: Interferon Signature Metric;
+SLE: Systemic lupus erythematosus.
+](images/lvs_analysis/lv116/lv116-cell_types.svg "LV116 cell types"){#fig:lv116:cell_types width="100%"}
+
+
+
+The analysis of other top niacin-contributing LVs across different cardiovascular diseases revealed additional mechanisms of action.
+For example, *GPR109A/HCAR2* encodes a G protein-coupled high-affinity niacin receptor in adipocytes and immune cells, including monocytes, macrophages, neutrophils and dendritic cells [@doi:10.1016/j.tips.2006.05.008; @doi:10.1038/sj.jid.5700586].
+It was initially thought that the antiatherogenic effects of niacin were solely due to the inhibition of lipolysis in adipose tissue.
+However, it has been shown that nicotinic acid can reduce atherosclerosis progression independently of its antidyslipidemic activity by activating *GPR109A* in immune cells [@doi:10.1172/JCI41651], thus boosting anti-inflammatory processes [@doi:10.1161/ATVBAHA.108.179283].
+In addition, flushing, a common adverse effect of niacin, is also produced by the activation of GPR109A in Langerhans cells (macrophages of the skin).
+This alternative mechanism for niacin could have been hypothesized by examining the cell types where the top-contributing modules are expressed:
+for instance, LV116 and LV931 (Figures @fig:lv116:cell_types and @fig:sup:lv931, and Tables @tbl:sup:multiplier_pathways:lv116 and @tbl:sup:multiplier_pathways:lv931) were the top two modules for AT, with a strong signature in monocytes, macrophages, neutrophils, dendritic cells, among others.
+In Figure @fig:lv116:cell_types, it can be seen that LV116's genes are expressed as an immune response when these cell types are under different stimuli, such as diarrhea caused by different pathogens [@doi:10.1371/journal.pone.0192082], samples from multiple sclerosis or systemic lupus erythematosus [@doi:10.1371/journal.pone.0109760; @doi:10.1126/science.aac7442], or infected with different pathogens (such as herpes simplex virus [@url:https://www.ncbi.nlm.nih.gov/bioproject/PRJNA258384], West Nile virus [@doi:10.3390/v5071664], *Salmonella typhimurium* [@doi:10.1038/srep16882], among others).
+These three LVs (LV246, LV116 and LV931) were among the top 20 modules contributing to the niacin prediction across different cardiovascular traits (Table @tbl:niacin:cardio:top_lvs).
+
+
+<!-- niacin:cardiovascular:top_lvs:start DISABLE NOW, BUT HAS TO BE ADDED THE "end" version just below to update table -->
+| LV    | Cell type             | Disease                                      |
+|:------|:----------------------|:---------------------------------------------|
+| LV116 | Immune cells, skin    | Atherosclerosis (ICD10 I70)                  |
+|       |                       | Chronic ischaemic heart disease (ICD10 I25)  |
+|       |                       | Heart attack, angina, stroke or hypertension |
+|       |                       | Ischaemic heart disease (wide definition)    |
+| LV931 | Immune cells          | Atherosclerosis (ICD10 I70)                  |
+|       |                       | Heart attack, angina, stroke or hypertension |
+|       |                       | Ischaemic heart disease (wide definition)    |
+| LV246 | Adipose tissue, liver | Atherosclerosis (ICD10 I70)                  |
+|       |                       | High cholesterol (self-reported)             |
+|       |                       | Ischaemic heart disease (wide definition)    |
+
+Table: **LVs among the top 20 contributors to the prediction of niacin for five cardiovascular diseases.** "Heart attack, angina, stroke or hypertension" refers to the UK Biobank data-field 6150. GWAS sample size: Atherosclerosis (361,194 in total and 566 cases), Chronic ischaemic heart disease (361,194 in total and 12,769 cases), Heart attack, angina, stroke or hypertension (360,420 in total and 253,565 cases), Ischaemic heart disease/wide definition (361,194 in total and 20,857 cases), High cholesterol/self-reported (361,141 in total and 43,957 cases).  {#tbl:niacin:cardio:top_lvs}
+
+
+Beyond cardiovascular traits, there are other potentially interesting LVs that could extend our understanding of the mechanisms of niacin.
+For example, LV66, one of the top LVs affected by niacin (Figure @fig:sup:lv66), was mainly expressed in ovarian granulosa cells.
+This compound has been very recently considered a potential therapeutic for ovarian diseases [@doi:10.1159/000495051; @doi:10.1071/RD20306], as it was found to promote follicle growth and inhibit granulosa cell apoptosis in animal models.
diff --git a/tests/manuscripts/phenoplier_full/04.20.00.traits_clustering.md b/tests/manuscripts/phenoplier_full/04.20.00.traits_clustering.md
new file mode 100644
index 0000000..6b5571a
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/04.20.00.traits_clustering.md
@@ -0,0 +1,95 @@
+### LVs reveal trait clusters with shared transcriptomic properties
+
+![
+**Cluster analysis on traits using the latent gene expression representation.**
+**a)** The projection of TWAS results on 3,752 traits into the latent gene expression representation is the input data to the clustering process.
+Linear (PCA) and non-linear (UMAP) dimensionality reduction techniques were applied to the input data, and five different clustering algorithms processed all data versions.
+These algorithms derive partitions from the data using different parameters (such as the number of clusters), leading to an ensemble of 4,428 partitions.
+Then, a distance matrix is derived by counting how many times a pair of traits was grouped in different clusters across the ensemble.
+Finally, a consensus function is applied to the distance matrix to generate consolidated partitions with different numbers of clusters (from 2 to $\sqrt{n}\approx$ 60).
+These final solutions were represented in the clustering tree (Figure @fig:clustering:tree).
+**b)** The clusters found by the consensus function were used as labels to train a decision tree classifier on the original input data, which detects the LVs that better differentiate groups of traits.
+](images/clustering/clustering_design.svg "Cluster analysis on traits"){#fig:clustering:design width="100%"}
+
+
+We used the projection of gene-trait associations into the latent space to find groups of traits linked by the same transcriptional processes.
+Since individual clustering algorithms have different biases (i.e., assumptions about the data structure), we designed a consensus clustering framework that combines solutions or partitions of traits generated by different methods ([Methods](#sec:methods:clustering)).
+Consensus or ensemble approaches have been recommended to avoid several pitfalls when performing cluster analysis on biological data [@doi:10.1126/scisignal.aad1932].
+Since diversity in the ensemble is crucial for these methods, we generated different data versions which were processed using different methods with varying sets of parameters (Figure {@fig:clustering:design}a).
+Then, a consensus function combines the ensemble into a consolidated solution, which has been shown to outperform any individual member of the ensemble [@Strehl2002; @doi:10.1109/TPAMI.2005.113].
+Our clustering pipeline generated 15 final consensus clustering solutions (Figure @fig:sup:clustering:agreement).
+The number of clusters of these partitions (between 5 and 29) was learned from the data by selecting the partitions with the largest agreement with the ensemble [@Strehl2002].
+Instead of selecting one of these final solutions with a specific number of clusters, we used a clustering tree [@doi:10.1093/gigascience/giy083] (Figure @fig:clustering:tree) to examine stable groups of traits across multiple resolutions.
+To understand which latent variables differentiated the group of traits, we trained a decision tree classifier on the input data $\hat{\mathbf{M}}$ using the clusters found as labels (Figure {@fig:clustering:design}b, see [Methods](#sec:methods:clustering)).
+
+
+![
+**Clustering tree using multiple resolutions for clusters of traits.**
+Each row represents a partition/grouping of the traits, and each circle is a cluster from that partition.
+The number of clusters goes from 5 to 29.
+Arrows indicate how traits in one cluster move across clusters from different partitions.
+Most of the clusters are preserved across different resolutions, showing highly stable solutions even with independent runs of the clustering algorithm.
+<!--  -->
+RDW: red cell (erythrocyte) distribution width;
+BMI: body mass index;
+WC: waist circumference;
+HC: hip circumference;
+RA: rheumatoid arthritis;
+SLE: systemic lupus erythematosus;
+HTN: hypertension;
+IBD: inflammatory bowel disease;
+SCZ: schizophrenia;
+CAD: coronary artery disease;
+AD: Alzheimer's disease;
+<!--  -->
+The full lists of traits in each cluster in the last five partitions of the tree (from $k=16$ to $k=29$) are in Supplementary Data 3-7.
+](images/clustering/clustering_tree.svg "Clustering tree on groups of traits"){#fig:clustering:tree width="100%"}
+
+
+We found that phenotypes were grouped into five clear branches, defined by their first node at the top of Figure @fig:clustering:tree:
+0) a "large" branch that includes most of the traits subdivided only starting at $k$=16 (with asthma, subjective well-being traits, and nutrient intake clusters),
+1) heel bone-densitometry measurements,
+2) hematological assays on red blood cells,
+3) physical measures, including spirometry and body impedance, and anthropometric traits with fat-free and fat mass measures in separate sub-branches, and
+4) a "complex" branch including keratometry measurements, assays on white blood cells and platelets, skin and hair color traits, autoimmune disorders, and cardiovascular diseases (which also included other cardiovascular-related traits such as hand-grip strength [@pmid:25982160], and environmental/behavioral factors such as physical activity and diet) (see Supplementary Data 3-7 for all clustering results).
+Within these branches, results were relatively stable, with the same traits often clustered together across different resolutions.
+Arrows between clusters show traits moving from one group to another, and this mainly happens between clusters within the "complex" branch (4) and between clusters from the "large" branch (0) to the "complex" branch.
+This behavior is expected since complex diseases are usually associated with shared genetic and environmental factors and are thus hard to categorize into a single cluster.
+
+
+![
+**Cluster-specific and general transcriptional processes associated with different diseases.**
+The plot shows a submatrix of $\hat{\mathbf{M}}$ for the main trait clusters at $k$=29, considering only LVs (rows) that are well-aligned with at least one pathway.
+](images/clustering/global_clustermap-plain.svg "Heatmap with gene modules and traits"){#fig:clustering:heatmap width="100%"}
+
+
+Next, we analyzed which LVs were driving these clusters of traits.
+For this, we trained decision tree classifiers on the input data using each cluster at $k$=29 (bottom of Figure @fig:clustering:tree) as labels (see [Methods](#sec:methods:clustering)).
+This procedure yielded the top LVs that were most discriminative for each cluster.
+Several of these LVs were well-aligned to existing pathways (Figure @fig:clustering:heatmap), whereas others were not aligned to prior knowledge but still expressed in relevant tissues (Figure @fig:sup:clustering:novel:heatmap).
+In Figure @fig:clustering:heatmap, it can be seen that some LVs were highly specific to certain traits, while others were associated with a wide range of different phenotypes, thus potentially involved in more general biological functions.
+We used our regression framework to determine whether these LVs were significantly associated with different traits.
+For example, LVs such as LV928 and LV30, which were well-aligned to early progenitors of the erythrocytes lineage [@doi:10.1016/j.cell.2011.01.004] (Tables @tbl:sup:multiplier_pathways:lv928 and @tbl:sup:multiplier_pathways:lv30), were predominantly expressed in early differentiation stages of erythropoiesis (Figures @fig:sup:lv928 and @fig:sup:lv30) and strongly associated with different assays on red blood cells (FDR < 0.05; Tables @tbl:sup:phenomexcan_assocs:lv928, @tbl:sup:emerge_assocs:lv928, and @tbl:sup:emerge_assocs:lv30).
+In contrast, other LVs were highly specific, such as LV730, which is expressed in thrombocytes from different cancer samples (Figure @fig:sup:lv730 and Table @tbl:sup:multiplier_pathways:lv730), and strongly associated with hematological assays on platelets (FDR < 0.05, Table @tbl:sup:phenomexcan_assocs:lv730);
+or LV598, whose genes were expressed in corneal endothelial cells (Figure @fig:sup:lv598 and Table @tbl:sup:multiplier_pathways:lv598) and associated with keratometry measurements (Table @tbl:sup:phenomexcan_assocs:lv598).
+
+
+The sub-branches of autoimmune and cardiovascular diseases merged together at $k=10$ (middle of Figure @fig:clustering:tree), so we expected to find LVs that specifically affect one or both of these types of diseases.
+For example, LV57, expressed in T cells (Figure @fig:sup:lv57 and Table @tbl:sup:multiplier_pathways:lv57), was the most strongly associated gene module with autoimmune disorders in PhenomeXcan (Table @tbl:sup:phenomexcan_assocs:lv57), with significant associations with hypothyroidism that were replicated in eMERGE (Table @tbl:sup:emerge_assocs:lv57).
+However, this LV was also strongly associated with deep venous thrombosis in both PhenomeXcan and eMERGE.
+On the other hand, LV844 was more autoimmune-specific, with associations to polymyalgia rheumatica, type 1 diabetes, rheumatoid arthritis, and celiac disease in PhenomeXcan (Table @tbl:sup:phenomexcan_assocs:lv844).
+However, these did not replicate in eMERGE.
+This LV was expressed in a wide range of cell types, including blood, breast organoids, myeloma cells, lung fibroblasts, and different cell types from the brain (Figure @fig:sup:lv844 and Table @tbl:sup:multiplier_pathways:lv844).
+
+
+The cardiovascular sub-branch had 129 significant LV-trait associations in PhenomeXcan and 23 in eMERGE.
+LV136, aligned with known collagen formation and muscle contraction pathways (Table @tbl:sup:multiplier_pathways:lv136), was associated with coronary artery disease and keratometry measurements in PhenomeXcan (Table @tbl:sup:phenomexcan_assocs:lv136).
+In eMERGE, this LV was associated with coronary atherosclerosis (phecode: 411.4) (Table @tbl:sup:emerge_assocs:lv136).
+LV136 was expressed in a wide range of cell types, including fibroblasts, mesenchymal stem cells, osteoblasts, pancreatic stellate cells, cardiomyocytes, and adipocytes (Figure @fig:sup:lv136).
+Within the cardiovascular sub-branch, we found neuropsychiatric and neurodevelopmental disorders such as Alzheimer's disease, schizophrenia, and attention deficit hyperactivity disorder (ADHD).
+These disorders were previously linked to the cardiovascular system [@pmid:12093424; @doi:10.1161/CIRCULATIONAHA.113.002065; @doi:10.1192/bjp.bp.117.202606; @doi:10.1161/CIRCRESAHA.118.313563] and share several risk factors, including hypertension, high cholesterol, obesity, and smoking, among others [@doi:10.1186/s12916-014-0206-2; @doi:10.1111/j.1076-7460.2007.06696.x].
+However, our results grouped these diseases by potentially shared transcriptional processes expressed in specific tissues/cell types.
+Alzheimer's disease (not present in eMERGE), for instance, was significantly associated with LV21 in PhenomeXcan (Table @tbl:sup:phenomexcan_assocs:lv21).
+LV21, a gene module not aligned to prior pathways, was strongly expressed in a variety of soft tissue sarcomas, monocytes/macrophages (including microglia from cortex samples), and aortic valves (Figure @fig:sup:lv21 and Table @tbl:sup:multiplier_pathways:lv21).
+This LV was also strongly associated with lipids and high cholesterol in PhenomeXcan and hyperlipidemia (phecode: 272.1) in eMERGE (Table @tbl:sup:emerge_assocs:lv21).
+As discussed previously, macrophages play a key role in the reverse cholesterol transport and thus atherogenesis [@doi:10.1093/qjmed/hci136], and lipid metabolism in microglia has been recently identified as an important factor in the development of neurodegenerative diseases [@doi:10.3389/fphys.2020.00393].
diff --git a/tests/manuscripts/phenoplier_full/05.discussion.md b/tests/manuscripts/phenoplier_full/05.discussion.md
new file mode 100644
index 0000000..480c23c
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/05.discussion.md
@@ -0,0 +1,77 @@
+## Discussion
+
+We have introduced a novel computational strategy that integrates statistical associations from TWAS with groups of genes (gene modules) that have similar expression patterns across the same cell types.
+Our key innovation is that we project gene-trait associations through a latent representation derived not strictly from measures of normal tissue but also from cell types under a variety of stimuli and at various developmental stages.
+This improves interpretation by going beyond statistical associations to infer cell type-specific features of complex phenotypes.
+Our approach can identify disease-relevant cell types from summary statistics, and several disease-associated gene modules were replicated in eMERGE.
+Using a CRISPR screen to analyze lipid regulation, we found that our gene module-based approach can prioritize causal genes even when single gene associations are not detected.
+We interpret these findings with an omnigenic perspective of "core" and "peripheral" genes, suggesting that the approach can identify genes that directly affect the trait with no mediated regulation of other genes and thus prioritize alternative and potentially more attractive therapeutic targets.
+
+
+Using our gene module perspective, we also integrated drug-induced transcriptional profiles, which allowed us to connect diseases, drugs, and cell types.
+We showed that the LV-based drug-repurposing approach outperformed the gene-based one when predicting drug-disease links for 322 drugs across 53 diseases.
+Furthermore, and beyond statistical prediction, we focused on cardiovascular traits and a particular drug, niacin, to show that the approach connects pathophysiological processes with known mechanisms of action, including those in adipose tissue, immune cells, and ovarian granulosa cells.
+Our LV-based approach could be helpful in generating novel hypotheses to evaluate potential mechanisms of action, or even adverse effects, of known or experimental drugs.
+
+
+We found that the analysis of associations through latent representations provided reasonable groupings of diseases and traits affected by shared and distinct transcriptional mechanisms expressed in highly relevant tissues.
+Our cluster analysis approach also detected the LVs that were most discriminative for each cluster.
+Several of these LVs were also significantly associated with different traits.
+Some LVs were strongly aligned with known pathways, but others (like LV57) were not, which might represent novel disease-relevant mechanisms.
+In some cases, the features/LVs linked to phenotypes appear to be associated with specific cell types.
+Associations with such cell type marker genes may reveal potentially causal cell types for a phenotype with more precision.
+We observed modules expressed primarily in one tissue (such as adipose in LV246 or ovary in LV66).
+Others appeared to be expressed in many contexts, which may capture pathways associated with related complex diseases.
+For example, LV136 is associated with cardiovascular disease and measures of corneal biomechanics and is expressed in fibroblasts, osteoblasts, pancreas, liver, and cardiomyocytes, among others.
+Other examples include LV844, expressed in whole blood samples and associated with a range of autoimmune diseases;
+or LV57, which is clearly expressed in T cells and strongly associated with autoimmune diseases and venous thromboembolism.
+From an omnigenic point of view, these patterns might represent cases of "network pleiotropy," where the same cell types mediate molecularly related traits.
+To our knowledge, projection through a representation learned on complementary but distinct datasets is a novel approach to identifying cell type and pathway effects on complex phenotypes that is computationally simple to implement.
+
+
+We also demonstrated that clustering trees, introduced initially as a means to examine developmental processes in single-cell data, provide a multi-resolution grouping of phenotypes based on latent variable associations.
+We employed hard-partitioning algorithms (one trait belongs exclusively to one cluster) where the distance between two traits takes into account all gene modules.
+However, it is also plausible for two complex diseases to share only a few biological processes instead of being similar across most of them.
+Another important consideration is that our TWAS results were derived from a large set of GWAS of different sample sizes and qualities.
+Although the potential issues derived from this data heterogeneity were addressed before performing our cluster analyses on traits, data preprocessing steps are always challenging and might not avoid bias altogether.
+Considering groups of related diseases was previously shown to be more powerful in detecting shared genetic etiology [@doi:10.1038/ng.3985; @doi:10.1038/s41588-018-0121-0], and clustering trees provide a way to explore such relationships in the context of latent variables.
+
+
+Finally, we developed an LV-based regression framework to detect whether gene modules are associated with a trait using TWAS $p$-values.
+We used PhenomeXcan as a discovery cohort across four thousand traits, and many LV-trait associations replicated in eMERGE.
+In PhenomeXcan, we found 3,450 significant LV-trait associations (FDR < 0.05) with 686 LVs (out of 987) associated with at least one trait and 1,176 traits associated with at least one LV.
+In eMERGE, we found 196 significant LV-trait associations, with 116 LVs associated with at least one trait/phecode and 81 traits with at least one LV.
+We only focused on a few disease types from our trait clusters, but the complete set of associations on other disease domains is available in our [GitHub repository](https://github.com/greenelab/phenoplier) for future research.
+As noted in [Methods](#sec:methods:reg), one limitation of the regression approach is that the gene-gene correlations are only approximately accurate, which could lead to false positives if the correlation among the top genes in a module is not precisely captured.
+The regression model, however, is approximately well-calibrated, and we did not observe inflation when running the method in real data.
+
+
+Our approach rests on the assumption that gene modules with coordinated expression patterns will also manifest coordinated pathological effects.
+Our implementation in this work integrates two complementary approaches.
+The first is MultiPLIER, which extracts latent variables from large expression datasets, and these LVs could represent either real transcriptional processes or technical factors ("batch effects").
+We used a previously published model derived from recount2, which was designed to analyze rare disorders but might not be the optimal latent representation for the wide range of complex diseases considered here.
+Also, the underlying factorization method rests on linear combinations of variables, which could miss important and more complex co-expression patterns.
+In addition, recount2, the training dataset used, has since been surpassed in size and scale by other resources [@doi:10.1038/s41467-018-03751-6; @doi:10.1101/2021.05.21.445138].
+However, it is important to note that our models impose very few assumptions on the latent expression representation.
+Therefore, we should be able to easily replace MultiPLIER with other similar approaches like GenomicSuperSignature [@doi:10.1038/s41467-022-31411-3].
+The second approach we used in this study is TWAS, where we are only considering the hypothesis that GWAS loci affect traits via changes in gene expression.
+Other effects, such as coding variants disrupting protein-protein interactions, are not captured.
+Additionally, TWAS has several limitations that can lead to false positives [@doi:10.1038/s41588-019-0385-z; @doi:10.1016/j.ajhg.2020.11.012].
+Like GWAS, which generally detects groups of associated variants in linkage disequilibrium (LD), TWAS usually identifies several genes within the same locus [@doi:10.1038/s41588-018-0092-1; @doi:10.1038/ng.3367].
+This is due to sharing of GWAS variants in gene expression models, correlated expression of nearby genes, or even correlation of their predicted expression due to eQTLs in LD, among others [@doi:10.1038/s41588-019-0385-z].
+Our LV-based regression framework, however, accounts for these gene-gene correlations in TWAS reasonably well.
+
+
+Our findings are concordant with previous studies showing that drugs with genetic support are more likely to succeed through the drug development pipeline [@doi:10.1038/ng.3314; @doi:10.1038/nn.4618].
+In this case, projecting association results through latent variables better prioritized disease-treatment pairs than considering single-gene effects alone.
+An additional benefit is that the latent variables driving predictions represent interpretable genetic features that can be examined to infer potential mechanisms of action.
+Here we prioritized drugs for diseases with very different tissue etiologies, and a challenge of the approach is to select the most appropriate tissue model from TWAS to find reversed transcriptome patterns between genes and drug-induced perturbations.
+
+
+Ultimately, the quality of the representations is essential to performance.
+Here we used a representation derived from a factorization of bulk RNA-seq data.
+Detailed perturbation datasets and single-cell profiling of tissues, with and without perturbagens, and at various stages of development provide an avenue to generate higher quality and more interpretable representations.
+On the other hand, the key to interpretability is driven by the annotation of sample metadata.
+New approaches to infer and annotate with structured metadata are promising and can be directly applied to existing data [@doi:10.1101/2021.05.10.443525].
+Rapid improvements in both areas set the stage for latent variable projections to be widely applied to disentangle the genetic basis of complex human phenotypes.
+By providing a new perspective for a mechanistic understanding of statistical associations from TWAS, our method can generate testable hypotheses for the post-GWAS functional characterization of complex diseases, which will likely be an area of great importance in the coming years.
diff --git a/tests/manuscripts/phenoplier_full/07.00.methods.md b/tests/manuscripts/phenoplier_full/07.00.methods.md
new file mode 100644
index 0000000..cd8fce1
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/07.00.methods.md
@@ -0,0 +1,408 @@
+## Methods {#sec:methods}
+
+PhenoPLIER is a framework that combines different computational approaches to integrate gene-trait associations and drug-induced transcriptional responses with groups of functionally-related genes (referred to as gene modules or latent variables/LVs).
+Gene-trait associations are computed using the PrediXcan family of methods, whereas latent variables are inferred by the MultiPLIER models applied on large gene expression compendia.
+PhenoPLIER provides
+1) a regression model to compute an LV-trait association,
+2) a consensus clustering approach applied to the latent space to learn shared and distinct transcriptomic properties between traits, and
+3) an interpretable, LV-based drug repurposing framework.
+We provide the details of these methods below.
+
+
+### The PrediXcan family of methods for gene-based associations {#sec:methods:predixcan}
+
+We used Summary-PrediXcan (S-PrediXcan) [@doi:10.1038/s41467-018-03621-1] and Summary-MultiXcan (S-MultiXcan) [@doi:10.1371/journal.pgen.1007889] as the gene-based statistical approaches, which belong to the PrediXcan family of methods [@doi:10.1038/ng.3367].
+We broadly refer to these approaches as TWAS (transcriptome-wide association studies).
+S-PrediXcan, the summary-based version of PrediXcan, computes the univariate association between a trait and a gene's predicted expression in a single tissue.
+In contrast, S-MultiXcan, the summary-based version of MultiXcan, computes the joint association between a gene's predicted expression in all tissues and a trait.
+S-PrediXcan and S-MultiXcan only need GWAS summary statistics instead of individual-level genotype and phenotype data.
+
+Here we briefly provide the details about these TWAS methods that are necessary to explain our regression framework later (see the referenced articles for more information).
+In the following, we refer to $\mathbf{y}$ as a vector of traits for $n$ individuals that is centered for convenience (so that no intercept is necessary);
+$\tilde{\mathbf{t}}_l = \sum_{a \in \mathrm{model}_l} w_{a}^{l} X_{a}$ is the gene's predicted expression for all individuals in tissue $l$, $X_a$ is the genotype of SNP $a$ and $w_{a}^{l}$ its weight in the tissue prediction model $l$;
+and $\mathbf{t}_l$ is the standardized version of $\tilde{\mathbf{t}}_l$ with mean equal to zero and standard deviation equal to one.
+
+S-PrediXcan [@doi:10.1038/s41467-018-03621-1] is the summary version of PrediXcan [@doi:10.1038/ng.3367].
+PrediXcan models the trait as a linear function of the gene's expression on a single tissue using the univariate model
+
+$$
+\mathbf{y} = \mathbf{t}_l \gamma_l + \bm{\epsilon}_l,
+$$ {#eq:predixcan}
+
+where $\hat{\gamma}_l$ is the estimated effect size or regression coefficient, and $\bm{\epsilon}_l$ are the error terms with variance $\sigma_{\epsilon}^{2}$.
+The significance of the association is assessed by computing the $z$-score $\hat{z}_{l}=\hat{\gamma}_l / \mathrm{se}(\hat{\gamma}_l)$ for a gene's tissue model $l$.
+PrediXcan needs individual-level data to fit this model, whereas S-PrediXcan approximates PrediXcan $z$-scores using only GWAS summary statistics with the expression
+
+$$
+\hat{z}_{l} \approx \sum_{a \in \mathrm{model}_{l}} w_a^l \frac{\hat{\sigma}_a}{\hat{\sigma}_l} \frac{\hat{\beta}_a}{\mathrm{se}(\hat{\beta}_a)},
+$$ {#eq:spredixcan}
+
+where $\hat{\sigma}_a$ is the variance of SNP $a$, $\hat{\sigma}_l$ is the variance of the predicted expression of a gene in tissue $l$, and $\hat{\beta}_a$ is the estimated effect size of SNP $a$ from the GWAS.
+In these TWAS methods, the genotype variances and covariances are always estimated using the Genotype-Tissue Expression project (GTEx v8) [@doi:10.1126/science.aaz1776] as the reference panel.
+Since S-PrediXcan provides tissue-specific direction of effects (for instance, whether a higher or lower predicted expression of a gene confers more or less disease risk), we used the $z$-scores in our drug repurposing approach (described below).
+
+S-MultiXcan [@doi:10.1371/journal.pgen.1007889], on the other hand, is the summary version of MultiXcan.
+MultiXcan is more powerful than PrediXcan in detecting gene-trait associations, although it does not provide the direction of effects.
+Its main output is the $p$-value (obtained with an F-test) of the multiple tissue model
+
+$$
+\begin{split}
+\mathbf{y} & = \sum_{l=1}^{p} \mathbf{t}_l g_l + \mathbf{e} \\
+ & = \mathbf{T} \mathbf{g} + \mathbf{e},
+\end{split}
+$$ {#eq:multixcan}
+
+where $\mathbf{T}$ is a matrix with $p$ columns $\mathbf{t}_l$,
+$\hat{g}_l$ is the estimated effect size for the predicted gene expression in tissue $l$ (and thus $\hat{\mathbf{g}}$ is a vector with $p$ estimated effect sizes $\hat{g}_l$),
+and $\mathbf{e}$ are the error terms with variance $\sigma_{e}^{2}$.
+Given the high correlation between predicted expression values for a gene across different tissues, MultiXcan uses the principal components (PCs) of $\mathbf{T}$ to avoid collinearity issues.
+S-MultiXcan derives the joint regression estimates (effect sizes and their variances) in Equation (@eq:multixcan) using the marginal estimates from S-PrediXcan in Equation (@eq:spredixcan).
+Under the null hypothesis of no association, $\hat{\mathbf{g}}^{\top} \frac{\mathbf{T}^{\top}\mathbf{T}}{\sigma_{e}^{2}} \hat{\mathbf{g}} \sim \chi_{p}^{2}$, and therefore the significance of the association in S-MultiXcan is estimated with
+
+$$
+\begin{split}
+\frac{\hat{\mathbf{g}}^{\top} (\mathbf{T}^{\top}\mathbf{T}) \hat{\mathbf{g}}}{\sigma_{e}^{2}} & \approx \bm{\hat{\gamma}}^{\top} \frac{\sqrt{n-1}}{\sigma_{\epsilon}} \left(\frac{\mathbf{T}^{\top} \mathbf{T}}{n-1}\right)^{-1} \frac{\sqrt{n-1}}{\sigma_{\epsilon}} \bm{\hat{\gamma}} \\
+ & = \hat{\mathbf{z}}^{\top} Cor(\mathbf{T})^{-1} \hat{\mathbf{z}},
+\end{split}
+$$ {#eq:smultixcan}
+
+where $\hat{\mathbf{z}}$ is a vector with $p$ $z$-scores (Equation (@eq:spredixcan)) for each tissue available for the gene,
+and $Cor(\mathbf{T})$ is the autocorrelation matrix of $\mathbf{T}$.
+Since $\mathbf{T}^{\top}\mathbf{T}$ is singular for many genes, S-MultiXcan computes the pseudo-inverse $Cor(\mathbf{T})^{+}$ using the $k$ top PCs, and thus $\hat{\mathbf{z}}^{\top} Cor(\mathbf{T})^{+} \hat{\mathbf{z}} \sim \chi_k^2$.
+To arrive at this expression, S-MultiXcan uses the conservative approximation $\sigma_{e}^{2} \approx \sigma_{\epsilon}^{2}$, that is, the variance of the error terms in the joint regression is approximately equal to the residual variance of the marginal regressions.
+Another important point is that $Cor(\mathbf{T})$ is estimated using a global genotype covariance matrix, whereas marginal $\hat{z}_l$ in Equation (@eq:spredixcan) are approximated using tissue-specific genotype covariances.
+Although S-MultiXcan yields highly concordant estimates compared with MultiXcan, results are not perfectly correlated across genes [@doi:10.1371/journal.pgen.1007889].
+As we explain later, these differences are important for our LV-based regression model when computing the gene-gene correlation matrix.
+We used S-MultiXcan results for our LV-based regression model and our cluster analyses of traits.
+
+
+### TWAS resources {#sec:methods:twas}
+
+We used two large TWAS resources from different cohorts for discovery and replication, all obtained from European ancestries.
+PhenomeXcan [@doi:10.1126/sciadv.aba2083], our discovery cohort, provides results on 4,091 traits across different categories.
+Supplementary Data 1 has all the details about the included GWAS, sample size and disease/trait categories.
+In PhenomeXcan, these publicly available GWAS summary statistics were used to compute
+1) gene-based associations with the PrediXcan family of methods (described before), and
+2) a posterior probability of colocalization between GWAS loci and *cis*-eQTL with fastENLOC [@doi:10.1126/sciadv.aba2083; @doi:10.1016/j.ajhg.2020.11.012].
+We refer to the matrix of $z$-scores from S-PrediXcan (Equation (@eq:spredixcan)) across $q$ traits and $m$ genes in tissue $t$ as $\mathbf{M}^{t} \in \mathbb{R}^{q \times m}$.
+As explained later, matrices $\mathbf{M}^{t}$ were used in our LV-based drug repurposing framework since they provide direction of effects.
+The S-MultiXcan results (22,515 gene associations across 4,091 traits) were used in our LV-based regression framework and our cluster analyses of traits.
+For the cluster analyses, we used the $p$-values converted to $z$-scores: $\mathbf{M}=\Phi^{-1}(1 - p/2)$, where $\Phi^{-1}$ is the probit function.
+Higher $z$-scores correspond to stronger associations.
+
+Our replication cohort was eMERGE [@doi:10.1038/gim.2013.72], where the same TWAS methods were run on 309 phecodes [@doi:10.1101/2021.10.21.21265225] across different categories (more information about traits is available in [@doi:10.1101/2021.10.21.21265225]).
+We used these results to replicate the associations found with our LV-based regression framework in PhenomeXcan.
+
+
+### MultiPLIER and Pathway-level information extractor (PLIER) {#sec:methods:multiplier}
+
+MultiPLIER [@doi:10.1016/j.cels.2019.04.003] extracts patterns of co-expressed genes from recount2 [@doi:10.1038/nbt.3838] (without including GTEx samples), a large gene expression dataset.
+The approach applies the pathway-level information extractor method (PLIER) [@doi:10.1038/s41592-019-0456-1], which performs unsupervised learning using prior knowledge (canonical pathways) to reduce technical noise.
+PLIER uses a matrix factorization approach that deconvolutes gene expression data into a set of latent variables (LV), where each LV represents a gene module.
+The MultiPLIER models reduced the dimensionality in recount2 to 987 LVs.
+
+Given a gene expression dataset $\mathbf{Y}^{m \times c}$ with $m$ genes and $c$ experimental conditions and a prior knowledge matrix $\mathbf{C} \in \{0,1\}^{m \times p}$ for $p$ MSigDB pathways [@doi:10.1016/j.cels.2015.12.004] (so that $\mathbf{C}_{ij} = 1$ if gene $i$ belongs to pathway $j$), PLIER finds $\mathbf{U}$, $\mathbf{Z}$, and $\mathbf{B}$ minimizing
+
+$$
+||\mathbf{Y} - \mathbf{Z}\mathbf{B}||^{2}_{F} + \lambda_1 ||\mathbf{Z} - \mathbf{C}\mathbf{U}||^{2}_{F} + \lambda_2 ||\mathbf{B}||^{2}_{F} + \lambda_3 ||\mathbf{U}||_{L^1}
+$$ {#eq:met:plier_func}
+
+subject to $\mathbf{U}>0, \mathbf{Z}>0$;
+$\mathbf{Z}^{m \times l}$ are the gene loadings with $l$ latent variables,
+$\mathbf{B}^{l \times c}$ is the latent space for $c$ conditions,
+$\mathbf{U}^{p \times l}$ specifies which of the $p$ prior-information pathways in $\mathbf{C}$ are represented for each LV,
+and $\lambda_i$ are different regularization parameters used in the training step.
+$\mathbf{Z}$ is a low-dimensional representation of the gene space where each LV aligns as much as possible to prior knowledge, and it might represent either a known or novel gene module (i.e., a meaningful biological pattern) or noise.
+
+For our drug repurposing and cluster analyses, we used this model to project gene-trait (from TWAS) and gene-drug associations (from LINCS L1000) into this low-dimensional gene module space.
+For instance, TWAS associations $\mathbf{M}$ (either from S-PrediXcan or S-MultiXcan) were projected using
+
+$$
+\hat{\mathbf{M}} = (\mathbf{Z}^{\top} \mathbf{Z} + \lambda_{2} \mathbf{I})^{-1} \mathbf{Z}^{\top} \mathbf{M},
+$$ {#eq:proj}
+
+where $\hat{\mathbf{M}}^{l \times q}$ is a matrix where traits are represented by gene modules instead of single genes.
+As explained later, we used the same approach to project drug-induced transcriptional profiles in LINCS L1000 to obtain a representation of drugs using gene modules.
+
+
+### Regression model for LV-trait associations {#sec:methods:reg}
+
+We adapted the gene-set analysis framework from MAGMA [@doi:10.1371/journal.pcbi.1004219] to TWAS.
+We used a competitive test to predict gene-trait associations from TWAS using gene weights from an LV, testing whether top-weighted genes for an LV are more strongly associated with the phenotype than other genes with relatively small or zero weights.
+Thus, we fit the model
+
+$$
+\mathbf{m}=\beta_{0} + \mathbf{s} \beta_{s} + \sum_{i} \mathbf{x}_{i} \beta_{i} + \bm{\epsilon},
+$$ {#eq:reg:model}
+
+where $\mathbf{m}$ is a vector of S-MultiXcan gene $p$-values for a trait (with a $-\log_{10}$ transformation);
+$\mathbf{s}$ is a binary indicator vector with $s_{\ell}=1$ for the top 1% of genes with the largest loadings for LV $\ell$ (from $\mathbf{Z}_{\ell}$) and zero otherwise;
+$\mathbf{x}_{i}$ is a gene property used as a covariate;
+$\beta$ are effect sizes (with $\beta_{0}$ as the intercept);
+and $\bm{\epsilon} \sim \mathrm{MVN}(0, \sigma^{2} \mathbf{R})$ is a vector of error terms with a multivariate normal distribution (MVN) where $\mathbf{R}$ is the matrix of gene correlations.
+
+The model tests the null hypothesis $\beta_{s} = 0$ against the one-sided hypothesis $\beta_{s} > 0$.
+Therefore, $\beta_{s}$ reflects the difference in trait associations between genes that are part of LV $\ell$ and genes outside of it.
+Following the MAGMA framework, we used two gene properties as covariates:
+1) *gene size*, defined as the number of PCs retained in S-MultiXcan,
+and 2) *gene density*, defined as the ratio of the number of PCs to the number of tissues available.
+
+Since the error terms $\bm{\epsilon}$ could be correlated, we cannot assume they have independent normal distributions as in a standard linear regression model.
+In the PrediXcan family of methods, the predicted expression of a pair of genes could be correlated if they share eQTLs or if these are in LD [@doi:10.1038/s41588-019-0385-z].
+Therefore, we used a generalized least squares approach to account for these correlations.
+The gene-gene correlation matrix $\mathbf{R}$ was approximated by computing the correlations between the model sum of squares (SSM) for each pair of genes under the null hypothesis of no association.
+These correlations are derived from the individual-level MultiXcan model (Equation (@eq:multixcan)), where the predicted expression matrix $\mathbf{T}_{i} \in \mathbb{R}^{n \times p_i}$ of a gene $i$ across $p_i$ tissues is projected into its top $k_i$ PCs, resulting in matrix $\mathbf{P}_{i} \in \mathbb{R}^{n \times k_i}$.
+From the MAGMA framework, we know that the SSM for each gene is proportional to $\mathbf{y}^{\top} \mathbf{P}_{i} \mathbf{P}_{i}^{\top} \mathbf{y}$.
+Under the null hypothesis of no association, the covariance between the SSMs of genes $i$ and $j$ is therefore given by $2 \times \mathrm{Trace}(\mathbf{P}_{i}^{\top} \mathbf{P}_{j} \mathbf{P}_{j}^{\top} \mathbf{P}_{i})$.
+The standard deviations of each SSM are given by $\sqrt{2 \times k_{i}} \times (n - 1)$.
+Therefore, the correlation between the SSMs for genes $i$ and $j$ can be written as follows:
+
+$$
+\begin{split}
+\mathbf{R}_{ij} & = \frac{2 \times \mathrm{Tr}(\mathbf{P}_{i}^{\top} \mathbf{P}_{j} \mathbf{P}_{j}^{\top} \mathbf{P}_{i})}{\sqrt{2 \times k_{i}} \times \sqrt{2 \times k_{j}} \times (n - 1)^2} \\
+& = \frac{2 \times \mathrm{Tr}(Cor(\mathbf{P}_{i}, \mathbf{P}_{j}) \times Cor(\mathbf{P}_{j}, \mathbf{P}_{i}))}{\sqrt{2 \times k_{i}} \times \sqrt{2 \times k_{j}}},
+\end{split}
+$$ {#eq:reg:r}
+
+where columns $\mathbf{P}$ are standardized,
+$\mathrm{Tr}$ is the trace of a matrix,
+and the cross-correlation matrix between PCs $Cor(\mathbf{P}_{i}, \mathbf{P}_{j}) \in \mathbb{R}^{k_i \times k_j}$ is given by
+
+$$
+\begin{split}
+Cor(\mathbf{P}_{i}, \mathbf{P}_{j}) & = Cor(\mathbf{T}_{i} \mathbf{V}_{i}^{\top} \mathrm{diag}(\lambda_i)^{-1/2}, \mathbf{T}_{j} \mathbf{V}_{j}^{\top} \mathrm{diag}(\lambda_j)^{-1/2}) \\
+& = \mathrm{diag}(\lambda_i)^{-1/2} \mathbf{V}_{i} (\frac{\mathbf{T}_{i}^{\top} \mathbf{T}_{j}}{n-1}) \mathbf{V}_{j}^{\top} \mathrm{diag}(\lambda_j)^{-1/2},
+\end{split}
+$$ {#eq:reg:cor_pp}
+
+where $\frac{\mathbf{T}_{i}^{\top} \mathbf{T}_{j}}{n-1} \in \mathbb{R}^{p_i \times p_j}$ is the cross-correlation matrix between the predicted expression levels of genes $i$ and $j$,
+and columns of $\mathbf{V}_{i}$ and scalars $\lambda_i$ are the eigenvectors and eigenvalues of $\mathbf{T}_{i}$, respectively.
+S-MultiXcan keeps only the top eigenvectors using a condition number threshold of $\frac{\max(\lambda_i)}{\lambda_i} < 30$.
+To estimate the correlation of predicted expression levels for gene $i$ in tissue $k$ and gene $j$ in tissue $l$, $(\mathbf{t}_k^i, \mathbf{t}_l^j)$ ($\mathbf{t}_k^i$ is the $k$th column of $\mathbf{T}_{i}$), we used [@doi:10.1371/journal.pgen.1007889]
+
+$$
+\begin{split}
+\frac{(\mathbf{T}_{i}^{\top} \mathbf{T}_{j})_{kl}}{n-1} & = Cor(\mathbf{t}_k^i, \mathbf{t}_l^j) \\
+  & = \frac{ Cov(\mathbf{t}_k, \mathbf{t}_l) } { \sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} } \\
+  & = \frac{ Cov(\sum_{a \in \mathrm{model}_k} w_a^k X_a, \sum_{b \in \mathrm{model}_l} w_b^l X_b) }  {\sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} } \\
+  & = \frac{ \sum_{\substack{a \in \mathrm{model}_k \\ b \in \mathrm{model}_l}} w_a^k w_b^l Cov(X_a, X_b)} {\sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} } \\
+  & = \frac{ \sum_{\substack{a \in \mathrm{model}_k \\ b \in \mathrm{model}_l}} w_a^k w_b^l \Gamma_{ab}} {\sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} },
+\end{split}
+$$ {#eq:reg:corr_genes}
+
+where $X_a$ is the genotype of SNP $a$,
+$w_a^k$ is the weight of SNP $a$ for gene expression prediction in the tissue model $k$,
+and $\Gamma = \widehat{\mathrm{var}}(\mathbf{X}) = (\mathbf{X} - \bar{\mathbf{X}})^{\top} (\mathbf{X} - \bar{\mathbf{X}}) / (n-1)$ is the genotype covariance matrix using GTEx v8 as the reference panel, which is the same used in all TWAS methods described here.
+The variance of the predicted expression values of gene $i$ in tissue $k$ is estimated as [@doi:10.1038/s41467-018-03621-1]:
+
+$$
+\begin{split}
+\widehat{\mathrm{var}}(\mathbf{t}_k^i) & = (\mathbf{W}^k)^\top \Gamma^k \mathbf{W}^k \\
+        & = \sum_{\substack{a \in \mathrm{model}_k \\ b \in \mathrm{model}_k}} w_a^k w_b^k \Gamma_{ab}^k.
+\end{split}
+$$ {#eq:reg:var_gene}
+
+Note that, since we used the MultiXcan regression model (Equation (@eq:multixcan)), $\mathbf{R}$ is only an approximation of gene correlations in S-MultiXcan.
+As explained before, S-MultiXcan approximates the joint regression parameters in MultiXcan using the marginal regression estimates from S-PrediXcan in (@eq:spredixcan) with some simplifying assumptions and different genotype covariance matrices.
+This complicates the derivation of an S-MultiXcan-specific solution to compute $\mathbf{R}$.
+To account for this, we used a submatrix $\mathbf{R}_{\ell}$ corresponding to genes that are part of LV $\ell$ only (top 1% of genes) instead of the entire matrix $\mathbf{R}$.
+This simplification is conservative since correlations are accounted for top genes only.
+Our simulations ([Supplementary Note 1](#sm:reg:null_sim)) show that the model is approximately well-calibrated and can correct for LVs with adjacent and highly correlated genes at the top (e.g., Figure @fig:reg:nulls:qqplot:lv234).
+The simulation also identified 127 LVs in which the model was not well-calibrated (e.g., Figure @fig:reg:nulls:qqplot:lv914).
+As this can be attributed to limitations in accurately computing a gene correlation matrix, we excluded these LVs from our main analyses.
+
+In Equation (@eq:reg:corr_genes), for each gene, we only considered tissue models present in S-PrediXcan results, as well as SNPs present in GWAS used as input for the TWAS approaches.
+This is necessary to obtain more accurate correlation estimates [@doi:10.1371/journal.pgen.1007889].
+Therefore, we computed different correlation matrices for PhenomeXcan and eMERGE.
+In PhenomeXcan, most of the GWAS (4,049) were obtained from the UK Biobank using the same pipeline and including the same set of SNPs, so a single correlation matrix was used for this set.
+For the rest, we used a single correlation matrix for each group of traits that shared the same or most of the SNPs.
+
+We ran our regression model for all 987 LVs across the 4,091 traits in PhenomeXcan.
+For replication, we ran the model in the 309 phecodes in eMERGE.
+We adjusted the $p$-values using the Benjamini-Hochberg procedure.
+
+
+### LV-based drug repurposing approach {#sec:methods:drug}
+
+For the drug-disease prediction, we derived an LV-based method based on a drug repositioning framework previously used for psychiatry traits [@doi:10.1038/nn.4618], where individual/single genes associated with a trait are anticorrelated with expression profiles for drugs.
+We compared our LV-based method with this previously published, single-gene approach.
+For the single-gene method, we computed a drug-disease score by multiplying each S-PrediXcan set of signed $z$-scores in tissue $t$, $\mathbf{M}^t$, with another set of signed $z$-scores from transcriptional responses profiled in LINCS L1000 [@doi:10.1016/j.cell.2017.10.049], $\mathbf{L}^{c \times m}$ (for $c$ compounds).
+Here $\mathbf{M}^t$ contains information about whether a higher or lower predicted expression of a gene is associated with disease risk, whereas $\mathbf{L}$ indicates whether a drug increases or decreases the expression of a gene.
+Therefore, these two matrices can be multiplied to compute a score for a drug-disease pair.
+The result of this product is $\mathbf{D}^{t,k}=-1 \cdot \mathbf{M}^{t,k} \mathbf{L}^\top$, where $k$ refers to the number of most significant gene associations in $\mathbf{M}^t$ for each trait.
+As suggested in [@doi:10.1038/nn.4618], $k$ could be either all genes or the top 50, 100, 250, and 500; then, we averaged score ranks across all $k$ and obtained $\mathbf{D}^t$.
+Finally, for each drug-disease pair, we took the maximum prediction score across all tissues: $\mathbf{D}_{ij} = \max \{ \mathbf{D}_{ij}^t \mid \forall t \}$.
+
+
+The same procedure was used for the LV-based approach, where we projected $\mathbf{M}^{t}$ and $\mathbf{L}$ into the gene module latent space using Equation (@eq:proj), leading to $\hat{\mathbf{M}}^t$ and $\hat{\mathbf{L}}^{l \times c}$, respectively.
+Finally, $\mathbf{D}^{t,k}=-1 \cdot \hat{\mathbf{L}}^{\top} \hat{\mathbf{M}}^{t,k}$, where in this case $k$ could be all LVs or the top 5, 10, 25 and 50 (since we have an order of magnitude fewer LVs than genes).
+
+
+Since the gold standard of drug-disease medical indications is described with Disease Ontology IDs (DOID) [@doi:10.1093/nar/gky1032], we mapped PhenomeXcan traits to the Experimental Factor Ontology [@doi:10.1093/bioinformatics/btq099] using [@url:https://github.com/EBISPOT/EFO-UKB-mappings], and then to DOID.
+
+
+### Consensus clustering of traits {#sec:methods:clustering}
+
+We performed two preprocessing steps on the S-MultiXcan results before the cluster analysis.
+First, we combined results in $\mathbf{M}$ (with $p$-values converted to $z$-scores, as described before) for traits that mapped to the same Experimental Factor Ontology (EFO) [@doi:10.1093/bioinformatics/btq099] term using Stouffer's method: $\sum w_i M_{ij} / \sqrt{\sum w_i^2}$, where $w_i$ is a weight based on the GWAS sample size for trait $i$, and $M_{ij}$ is the $z$-score for gene $j$.
+Second, we divided all $z$-scores for each trait $i$ by their sum to reduce the effect of highly polygenic traits: $M_{ij} / \sum M_{ij}$.
+Finally, we projected this data matrix using Equation (@eq:proj), obtaining $\hat{\mathbf{M}}$ with $n$=3,752 traits and $l$=987 LVs as the input of our clustering pipeline.
+
+
+A partitioning of $\hat{\mathbf{M}}$ with $n$ traits into $k$ clusters is represented as a label vector $\pi \in \mathbb{N}^n$.
+Consensus clustering approaches consist of two steps:
+1) the generation of an ensemble $\Pi$ with $r$ partitions of the dataset: $\Pi=\{\pi_1, \pi_2, \ldots, \pi_r\}$,
+and 2) the combination of the ensemble into a consolidated solution defined as:
+
+$$
+\pi^* = \mathrm{arg}\,\underset{\hat{\pi}}{\max} Q(\{ \lvert \mathcal{L}^i \rvert \phi(\hat{\pi}_{\mathcal{L}^i}, \pi_{i \mathcal{L}^i}) \mid i \in \{1,\ldots,r\} \}),
+$$ {#eq:consensus:obj_func}
+
+where $\mathcal{L}^i$ is a set of data indices with known cluster labels for partition $i$,
+$\phi\colon \mathbb{N}^n \times \mathbb{N}^n \to \mathbb{R}$ is a function that measures the similarity between two partitions,
+and $Q$ is a measure of central tendency, such as the mean or median.
+We used the adjusted Rand index (ARI) [@doi:10.1007/BF01908075] for $\phi$ and the median for $Q$.
+To obtain $\pi^*$, we define a consensus function $\Gamma\colon \mathbb{N}^{n \times r} \to \mathbb{N}^n$ with $\Pi$ as the input.
+We used consensus functions based on the evidence accumulation clustering (EAC) paradigm [@doi:10.1109/TPAMI.2005.113], where $\Pi$ is first transformed into a distance matrix
+$\mathbf{D}_{ij} = d_{ij} / r$,
+where $d_{ij}$ is the number of times traits $i$ and $j$ were grouped in different clusters across all $r$ partitions in $\Pi$.
+Then, $\Gamma$ can be any similarity-based clustering algorithm, which is applied on $\mathbf{D}$ to derive the final partition $\pi^*$.
+
+
+For the ensemble generation step, we used different algorithms to create a highly diverse set of partitions (see Figure @fig:clustering:design) since diversity is an important property for ensembles [@doi:10.1016/j.ins.2016.04.027; @doi:10.1109/TPAMI.2011.84; @doi:10.1016/j.patcog.2014.04.005].
+We used three data representations: the raw dataset, its projection into the top 50 principal components, and the embedding learned by UMAP [@arxiv:1802.03426] using 50 components.
+For each of these, we applied five clustering algorithms covering a wide range of different assumptions on the data structure: $k$-means [@Arthur2007], spectral clustering [@Ng2001], a Gaussian mixture model (GMM), hierarchical clustering, and DBSCAN [@Ester1996].
+For $k$-means, spectral clustering and GMM, we specified a range of $k$ between 2 and $\sqrt{n} \approx 60$, and for each $k$ we generated five partitions using random seeds.
+For hierarchical clustering, for each $k$, we generated four partitions using common linkage criteria: ward, complete, average and single.
+For DBSCAN, we combined different ranges for parameters $\epsilon$ (the maximum distance between two data points to be considered part of the same neighborhood) and *minPts* (the minimum number of data points in a neighborhood for a data point to be considered a core point), based on the procedure in [@doi:10.1088/1755-1315/31/1/012012].
+Specifically, we used *minPts* values from 2 to 125.
+For each data representation (raw, PCA and UMAP), we determined a plausible range of $\epsilon$ values by observing the distribution of the mean distance of the *minPts*-nearest neighbors across all data points.
+Since some combinations of *minPts* and $\epsilon$ might not produce a meaningful partition (for instance, when all points are detected as noisy or only one cluster is found), we resampled partitions generated by DBSCAN to ensure an equal representation of this algorithm in the ensemble.
+This procedure generated a final ensemble of 4,428 partitions of 3,752 traits.
+
+
+Finally, we used spectral clustering on $\mathbf{D}$ to derive the final consensus partitions.
+$\mathbf{D}$ was first transformed into a similarity matrix by applying an RBF kernel $\mathrm{exp}(-\gamma \mathbf{D}^2)$ using four different values for $\gamma$ that we empirically determined to work best.
+Therefore, for each $k$ between 2 and 60, we derived four consensus partitions and selected the one that maximized Equation (@eq:consensus:obj_func).
+We further filtered this set of 59 solutions to keep only those with an ensemble agreement larger than the 75th percentile (Figure @fig:sup:clustering:agreement), leaving a total of 15 final consensus partitions shown in Figure @fig:clustering:tree.
+
+The input data in our clustering pipeline undergoes several linear and nonlinear transformations, including PCA, UMAP and the ensemble transformation using the EAC paradigm (distance matrix $\mathbf{D}$).
+Although consensus clustering has clear advantages for biological data [@pmid:27303057], this set of data transformations complicates the interpretation of results.
+To circumvent this, we used a supervised learning approach to detect which gene modules/LVs are the most important for each cluster of traits (Figure {@fig:clustering:design}b).
+Note that we did not use this supervised model for prediction but only to learn which features (LVs) were most discriminative for each cluster.
+For this, we used the highest resolution partition ($k$=29, although any could be used) to train a decision tree model using each of the clusters as labels and the projected data $\hat{\mathbf{M}}$ as the training samples.
+For each $k$, we built a set of binary labels with the current cluster's traits as the positive class and the rest of the traits as the negative class.
+Then, we selected the LV in the root node of the trained model only if its threshold was positive and larger than one standard deviation.
+Next, we removed this LV from $\hat{\mathbf{M}}$ (regardless of being previously selected or not) and trained the model again.
+We repeated this procedure 20 times to extract the top 20 LVs that better discriminate traits in a cluster from the rest.
+
+In [Supplementary Note 2](#sm:clustering:null_sim), we performed several analyses under a null hypothesis of no structure in the data to verify that the clustering results detected by this pipeline were real.
+
+
+### CRISPR-Cas9 screening {#sec:methods:crispr}
+
+**Cell culture.**
+HepG2 cells were obtained from ATCC (ATCC® HB-8065™), and maintained in Eagle's Minimum Essential Medium with L-Glutamine (EMEM, Cat. 112-018-101, Quality Biology) supplemented with 10% Fetal Bovine Serum (FBS, Gibco, Cat.16000-044), and 1% Pen/Strep (Gibco, Cat.15140-122).
+Cells were kept at 37oC in a humidity-controlled incubator with 5% CO2, and were maintained at a density not exceeding 80% confluency in Collagen-I coated flasks.
+
+**Genome-wide lentiviral pooled CRISPR-Cas9 library.**
+3rd lentiviral generation, Broad GPP genome-wide Human Brunello CRISPR knockout Pooled library was provided by David Root and John Doench from Addgene (Cat. 73179-LV), and was used for HepG2 cell transduction.
+It consists of 76,441 sgRNAs, and targets 19,114 genes in the human genome with an average of 4 sgRNAs per gene.
+Each 20nt sgRNA cassette was inserted into the lentiCRISPRv2 backbone between U6 promoter and gRNA scaffold.
+Through cell transduction, the lentiviral vectors which encode Cas9 were used to deliver the sgRNA cassette containing plasmids into cells during cell replication.
+Unsuccessfully transduced cells were excluded through puromycin selection.
+
+**Lentiviral titer determination.**
+No-spin lentiviral transduction was utilized for the screen.
+In a Collagen-I coated 6-well plate, approximately 2.5 M cells were seeded in each well in the presence of 8ug/ml polybrene (Millipore Sigma, Cat. TR-1003 G), and a different titrated virus volume (e.g., 0, 50, 100, 200, 250, and 400ul) was assigned to each well.
+EMEM complete media was added to make the final volume of 1.24ml.
+16-18hrs post-transduction, virus/polybrene-containing media was removed from each well.
+Cells were washed twice with 1x DPBS and replaced with fresh EMEM.
+At 24h, cells in each well were trypsinized, diluted (e.g.,1:10), and seeded in pairs of wells of 6-well plates.
+At 60hr post-transduction, cell media in each well was replaced with fresh EMEM.
+2ug/ml of puromycin (Gibco, Cat. A1113803) was added to one well out of the pair.
+2-5 days after puromycin selection, or once the 0 virus well treated with puromycin had no surviving cells, cells in both wells with/without puromycin were collected and counted for viability.
+Percentage of Infection (PI%) was obtained by comparing the cell numbers with/without puromycin selection within each pair.
+According to Poisson distribution theory, a transduction efficiency (PI%) between 30-50% corresponds to an MOI (Multiplicity of Infection) of ~0.35-0.70.
+At MOI close to 0.3, around 25% of cells are infected, and the majority of those infected cells are predicted to have only one copy of the virus.
+Therefore, a volume of virus (120ul) yielding 30-40% of transduction efficiency was chosen for further large-scale viral transduction.
+
+**Lentiviral Transduction in HepG2 Using Brunello CRISPR Knockout Pooled Library.**
+In order to achieve a coverage (representation) of at least 500 cells per sgRNA, and at an MOI between 0.3-0.4 to ensure 95% of infected cells get only one viral particle per cell, ~200M cells were initiated for the screen.
+Transduction was carried out in a similar fashion as described above.
+Briefly, 2.5M cells were seeded in each well of 14 6-well plates, along with 8ug/ml of polybrene.
+A volume of 120ul of the virus was added to each experimental well.
+18hrs post-transduction, virus/PB mix medium was removed, and cells in each well were collected, counted, and pooled into T175 flasks.
+At 60hr post-transduction, 2ug/ml of puromycin was added to each flask.
+Mediums were changed every two days with fresh EMEM, topped with 2ug/ml puromycin.
+Seven days after puromycin selection, cells were collected, pooled, counted, and replated.
+
+**Fluorescent dye staining.**
+9 days after puromycin selection, cells were assigned to 2 groups.
+20-30M cells were collected as Unsorted Control.
+The cell pellet was spun down at 500 x g for 5min at 4oC.
+The dry pellet was kept at -80oC for further genomic DNA isolation.
+The rest of the cells (approximately 200M) were kept in 100mm dishes and stained with a fluorescent dye (LipidSpotTM 488, Biotium, Cat. 70065-T).
+In brief, LipidSpot 488 was diluted to 1:100 with DPBS.
+4ml of staining solution was used for each dish and incubated at 37oC for 30min.
+Cell images were captured through fluorescent microscope EVOS for GFP signal detection (Figure @fig:sup:crispr:fig1).
+
+**Fluorescence-activated cell sorting (FACS).**
+Cells were immediately collected into 50ml tubes (From this point on, keep cells cold), and spun at 500 x g for 5min at 4oC.
+After DPBS wash, cell pellets were resuspended with FACS Sorting Buffer (1x DPBS without Ca2+/Mg2+, 2.5mM EDTA, 25mM HEPES, 1% BSA.
+The solution was filter sterilized, and kept at 4oC), with gentle pipetting to make single cells.
+The cell solution was then filtered through a cell strainer (Falcon, Cat. 352235) and was kept on ice, protected from light.
+Collected cells were sorted on FACSJazz.
+100um nozzle was used for sorting.
+~20% of each GFP-High and GFP-Low (Figure @fig:sup:crispr:fig2) were collected into 15ml tubes.
+After sorting, cells were immediately spun down.
+Pellets were kept at -80oC for further genomic DNA isolation.
+
+**Genomic DNA isolation and verification.**
+Three conditions of Genomic DNA (Un-Sorted Control, lentiV2 GFP-High, and lentiV2 GFP-Low) were extracted using QIAamp DNA Blood Mini Kit (Qiagen, Cat.51104), followed by UV Spectroscopy (Nanodrop) to assess the quality and quantity of the gDNA.
+A total of 80-160ug of gDNA was isolated for each condition.
+sgRNA cassette and lentiviral specific transgene in isolated gDNA were verified through PCR (Figure @fig:sup:crispr:fig3).
+
+**Illumina libraries generation and sequencing.**
+The fragment containing sgRNA cassette was amplified using P5 /P7 primers, as indicated in [@pmid:26780180], and primer sequences were adapted from Broad Institute protocol (Figure @fig:sup:crispr:table1).
+Stagger sequence (0-8nt) was included in P5 and 8bp uniquely barcoded sequence in P7.
+Primers were synthesized through Integrated DNA Technologies (IDT), and each primer was PAGE purified.
+32 PCR reactions were set up for each condition.
+Each 100ul PCR reaction consists of roughly 5ug of gDNA, 5ul of each 10uM P5 and P7.
+ExTaq DNA Polymerase (TaKaRa, Cat. RR001A) was used to amplify the amplicon.
+PCR Thermal Cycler Parameters were set as follows: Initial at 95oC for 1min; followed by 24 cycles of Denaturation at 94oC for 30 seconds, Annealing at 52.5oC for 30 seconds, Extension at 72oC for 30 seconds;
+and a final Elongation at 72oC for 10 minutes.
+285bp-293bp PCR products were expected (Figure @fig:sup:crispr:fig4 A).
+PCR products within the same condition were pooled and purified using SPRIselect beads (Beckman Coulter, Cat. B23318).
+Purified Illumina libraries were quantitated on Qubit, and the quality of the library was analyzed on Bio-analyzer using High Sensitivity DNA Chip.
+A single approximately 285bp peak was expected (Figure @fig:sup:crispr:fig4 B).
+Final Illumina library samples were sequenced on Nova-seq 6000.
+Samples were pooled and loaded on an SP flow cell, along with a 20% PhiX control v3 library spike-in.
+
+
+## Data availability
+
+All the main datasets generated in this study are available at [https://doi.org/10.5281/zenodo.8071382](https://doi.org/10.5281/zenodo.8071382) [@doi:10.5281/zenodo.8071382] and the GitHub repository [https://github.com/greenelab/phenoplier](https://github.com/greenelab/phenoplier).
+
+The main input datasets used are TWAS from PhenomeXcan [@doi:10.1126/sciadv.aba2083] for 4,091 traits and from the Electronic Medical Records and Genomics (eMERGE) network phase III [@doi:10.1101/2021.10.21.21265225] for 309 traits;
+transcriptional responses to small molecule perturbations from LINCS L1000 [@doi:10.1016/j.cell.2017.10.049] that were further preprocessed and mapped to DrugBank IDs from [@doi:10.5281/zenodo.47223];
+latent space/gene module models from MultiPLIER [@doi:10.1016/j.cels.2019.04.003].
+
+The data used from PhenomeXcan, LINCS L1000, and MultiPLIER are publicly available.
+All significant results reported for the eMERGE and Penn Medicine BioBank (PMBB) phenome-wide TWAS are contained in [@doi:10.1101/2021.10.21.21265225].
+The individual-level PMBB raw datasets cannot be made publicly available due to institutional privacy policy.
+Please contact Penn Medicine Biobank ([https://pmbb.med.upenn.edu/pmbb/](https://pmbb.med.upenn.edu/pmbb/)) for requests of access to data.
+eMERGE network phase III data is available on dbGAP (Accession: phs001584.v2.p2).
+
+
+## Code availability
+
+The code necessary to reproduce all the analyses in this work is available at [https://doi.org/10.5281/zenodo.8071382](https://doi.org/10.5281/zenodo.8071382) [@doi:10.5281/zenodo.8071382] and the GitHub repository [https://github.com/greenelab/phenoplier](https://github.com/greenelab/phenoplier).
+
+For the CRISPR screening, we used FlowJo v10.7 and FACS Jazz Software v1.1.
+For data analysis, we used Python 3.8 and R 3.6 with several computational packages.
+The main Python packages used were: Jupyter Lab (2.2), pandas (1.1), matplotlib (3.3), seaborn (0.11), numpy (1.19), scipy (1.5), scikit-learn (0.23), and umap-learn (0.4).
+The main R packages were: Bioconductor (3.10), clusterProfiler (3.14), clustree (0.4), and fgsea (1.17).
+We also developed several scripts and notebooks which are published under an open-source license.
+We documented all the steps necessary to carry out all the analyses.
+We also provide a Docker image to use the same runtime environment we used, and a demo to quickly test the methods on real data.
diff --git a/tests/manuscripts/phenoplier_full/10.references.md b/tests/manuscripts/phenoplier_full/10.references.md
new file mode 100644
index 0000000..339b33c
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/10.references.md
@@ -0,0 +1,4 @@
+## References {.page_break_before}
+
+<!-- Explicitly insert bibliography here -->
+<div id="refs"></div>
diff --git a/tests/manuscripts/phenoplier_full/15.acknowledgements.md b/tests/manuscripts/phenoplier_full/15.acknowledgements.md
new file mode 100644
index 0000000..1c26c0e
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/15.acknowledgements.md
@@ -0,0 +1,48 @@
+## Acknowledgements
+
+This study was funded by:
+the Gordon and Betty Moore Foundation (GBMF 4552 to C.S. Greene; GBMF 4560 to B.D. Sullivan),
+the National Human Genome Research Institute (R01 HG010067 to C.S. Greene, S.F.A. Grant and B.D. Sullivan; K99 HG011898 and R00 HG011898 to M. Pividori; U01 HG011181 to W. Wei),
+the National Cancer Institute (R01 CA237170 to C.S. Greene),
+the Eunice Kennedy Shriver National Institute of Child Health and Human Development (R01 HD109765 to C.S. Greene),
+the National Institute of Aging (R01AG069900 to W. Wei),
+the National Institute of General Medical Sciences (R01 GM139891 to W. Wei);
+the National Heart, Lung, and Blood Institute (R01 HL163854 to Q. Feng);
+the National Institute of Diabetes and Digestive and Kidney Diseases (DK126194 to B.F. Voight);
+the Daniel B. Burke Endowed Chair for Diabetes Research to S.F.A. Grant;
+the Robert L. McNeil Jr. Endowed Fellowship in Translational Medicine and Therapeutics to C. Skarke.
+
+The Phase III of the eMERGE Network was initiated and funded by the NHGRI through the following grants:
+U01 HG8657 (Group Health Cooperative/University of Washington);
+U01 HG8685 (Brigham and Womens Hospital);
+U01 HG8672 (Vanderbilt University Medical Center);
+U01 HG8666 (Cincinnati Childrens Hospital Medical Center);
+U01 HG6379 (Mayo Clinic);
+U01 HG8679 (Geisinger Clinic);
+U01 HG8680 (Columbia University Health Sciences);
+U01 HG8684 (Childrens Hospital of Philadelphia);
+U01 HG8673 (Northwestern University);
+U01 HG8701 (Vanderbilt University Medical Center serving as the Coordinating Center);
+U01 HG8676 (Partners Healthcare/Broad Institute);
+and U01 HG8664 (Baylor College of Medicine).
+
+The Penn Medicine BioBank (PMBB) is funded by the Perelman School of Medicine at the University of Pennsylvania, a gift from the Smilow family, and the National Center for Advancing Translational Sciences of the National Institutes of Health under CTSA Award Number UL1TR001878.
+We thank D. Birtwell, H. Williams, P. Baumann and M. Risman for informatics support regarding the PMBB.
+We thank the staff of the Regeneron Genetics Center for whole-exome sequencing of DNA from PMBB participants.
+
+Figure {@fig:entire_process}a was created with BioRender.com.
+
+
+## Author contributions statement
+
+M. Pividori and C.S. Greene conceived and designed the study.
+M. Pividori designed the computational methods, performed the experiments, analyzed the data, interpreted the results, and drafted the manuscript.
+C.S. Greene supervised the entire project and provided critical guidance throughout the study.
+S. Lu, C. Su, and M.E. Johnson performed the CRISPR screen with the supervision of S.F.A. Grant.
+B. Li provided the TWAS results for eMERGE for replication, and this analysis was supervised by M.D. Ritchie.
+W. Wei, Q. Feng, B. Namjou, K. Kiryluk, I. Kullo, Y. Luo, and M.D. Ritchie, as part of the eMERGE consortium, provided critical feedback regarding the analyses of this data.
+All authors revised the manuscript and provided critical feedback.
+
+## Competing interests statement
+
+The authors declare no competing interests.
diff --git a/tests/manuscripts/phenoplier_full/50.00.supplementary_material.md b/tests/manuscripts/phenoplier_full/50.00.supplementary_material.md
new file mode 100644
index 0000000..918faf1
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/50.00.supplementary_material.md
@@ -0,0 +1,743 @@
+\clearpage
+
+## Supplementary information {.page_break_before}
+
+### Supplementary Note 1: mean type I error rates and calibration of LV-based regression model {#sm:reg:null_sim}
+
+We assessed our GLS model type I error rates (proportion of $p$-values below 0.05) and calibration using a null model of random traits and genotype data from 1000 Genomes Phase III.
+We selected 312 individuals with European ancestry, and then analyzed 1,000 traits drawn from a standard normal distribution $\mathcal{N}(0,1)$.
+We ran all the standard procedures for the TWAS approaches (S-PrediXcan and S-MultiXcan), including:
+1) a standard GWAS using linear regression under an additive genetic model,
+2) different GWAS processing steps, including harmonization and imputation procedures as defined in [@doi:10.1002/gepi.22346],
+3) S-PrediXcan and S-MultiXcan analyses.
+Below we provide details for each of these steps.
+
+**Step 1 - GWAS**. We performed standard QC procedures such as
+filtering out variants with missing call rates exceeding 0.01, MAF below 1% or MAC below 20, and HWE below 1e-6,
+and removing samples with high sex-discrepancy and high-relatedness (first and second degree).
+We included sex and the top 20 principal components as covariates, performing the association test on 5,923,554 variants across all 1,000 random phenotypes.
+
+**Step 2 - GWAS processing**. These steps include harmonization of GWAS and imputation of $z$-scores, which are part of the TWAS pipeline and are needed in order to ensure an acceptable overlap with SNPs in prediction models.
+The scripts to run these steps are available in [@url:https://github.com/hakyimlab/summary-gwas-imputation].
+These procedures were run for all 1,000 random phenotypes and generated a total number of 8,325,729 variants, including those with original and imputed $z$-scores.
+
+**Step 3 - TWAS**. We processed the imputed GWAS with S-PrediXcan using the MASHR prediction models on 49 tissues from GTEx v8.
+Then, S-MultiXcan was run using the GWAS and S-PrediXcan outputs to generate gene-trait association $p$-values.
+
+Finally, we ran our GLS model (Equation (@eq:reg:model)) to compute an association between each of the 987 LVs in MultiPLIER and the 1,000 S-MultiXcan results on random phenotypes.
+For this, we built a gene correlation matrix specifically for this cohort (see [Methods](#sec:methods:reg)).
+Then, we compared the GLS results with an equivalent, baseline ordinary least squares (OLS) model assuming independence between genes.
+Figure @fig:reg:nulls:qqplots compares the distribution of $p$-values of the OLS and GLS models.
+The GLS model has a slightly smaller mean type I error rate (0.0558, SD=0.0127) than the baseline OLS model (0.0584, SD=0.0140), and $p$-values follow more closely the expected uniform distribution.
+Importantly, the GLS model is able to correct for LVs with adjacent and highly correlated genes at the top such as LV234 (Figure @fig:reg:nulls:qqplot:lv234), LV847 (Figure @fig:reg:nulls:qqplot:lv847), LV45 (Figure @fig:reg:nulls:qqplot:lv45), or LV800 (Figure @fig:reg:nulls:qqplot:lv800), among others.
+In contrast and as expected, the OLS model has higher mean type I errors and smaller-than-expected $p$-values in all these cases.
+
+We also detected other LVs with higher-than-expected mean type I errors for both the GLS and OLS models, although they don't have a relatively large number of adjacent genes at the top.
+One example is LV914, shown in Figure @fig:reg:nulls:qqplot:lv914.
+Inflation in these LVs might be explained by inaccuracies in correlation estimates between the individual-level MultiXcan model and its summary-based version (see Methods).
+Therefore, we flagged those with a type I error rate larger than 0.07 (127 LVs) and excluded them from our main analyses.
+We didn't see signs of inflation when applying the method in real data (Figure @fig:reg:real:qqplots).
+
+![
+**QQ-plots for OLS (baseline) and GLS (PhenoPLIER) models on random phenotypes.**
+](images/gls/null_sims/models_qqplots.png "QQ-plots for OLS and GLS models"){#fig:reg:nulls:qqplots tag="S1" width="80%"}
+
+![
+**QQ-plots for LV234 on random phenotypes.**
+Among the top 1% of genes in this LV, 17 are located in band 6p22.2, 5 in 6p22.1 and 3 in 7q11.23.
+](images/gls/null_sims/models_lv234.png "QQ-plots for LV234"){#fig:reg:nulls:qqplot:lv234 tag="S2" width="80%"}
+
+![
+**QQ-plots for LV847 on random phenotypes.**
+Among the top 1% of genes in this LV, 15 are located in band 6p22.2, 5 in 6p22.1 and 2 in 15q26.1.
+](images/gls/null_sims/models_lv847.png "QQ-plots for LV847"){#fig:reg:nulls:qqplot:lv847 tag="S3" width="80%"}
+
+![
+**QQ-plots for LV45 on random phenotypes.**
+Among the top 1% of genes in this LV, 12 are located in band 6p22.2, 6 in 6p22.1 and 3 in 1q23.3.
+](images/gls/null_sims/models_lv45.png "QQ-plots for LV45"){#fig:reg:nulls:qqplot:lv45 tag="S4" width="80%"}
+
+![
+**QQ-plots for LV800 on random phenotypes.**
+Among the top 1% of genes in this LV, 16 are located in band 19q13.43, 9 in 19p13.2 and 9 in 19q13.31.
+](images/gls/null_sims/models_lv800.png "QQ-plots for LV800"){#fig:reg:nulls:qqplot:lv800 tag="S5" width="80%"}
+
+![
+**QQ-plots for LV914 on random phenotypes.**
+Among the top 1% of genes in this LV, 2 are located in band 13q13.3, 2 in 7p15.2 and 2 in 19q13.2.
+](images/gls/null_sims/models_lv914.png "QQ-plots for LV914"){#fig:reg:nulls:qqplot:lv914 tag="S6" width="80%"}
+
+![
+**QQ-plots of LV-trait associations in real data.**
+QQ-plot in PhenomeXcan (left, discovery cohort) across 4,091 traits and 987 LVs, and eMERGE (right, replication cohort) across 309 traits and 987 LVs.
+](images/gls/real_data/qqplots.png "QQ-plots in real data"){#fig:reg:real:qqplots tag="S7" width="80%"}
+
+
+\clearpage
+
+### CRISPR-Cas9
+
+![
+**EVOS Fluorescence Microscope Image Capture.**
+A. HepG2_lentiV2_Ctrl with no-viral transduction.
+B. HepG2_lentiV2 with viral transduction.
+Both no-viral transduction Control (A) and lentiviral transduction (B) HepG2 cells were stained with LipidSpot™488.
+The CRISPR screening process was performed once, but we conducted two selections (high and low fluorescence) with a control of no/before selection. Subsequently, we generated three technical replicates for the DNA-seq libraries under each condition. In order to mitigate false positives resulting from the single-screen process, we overlapped the candidates from multiple pairwise differential analyses and selected the genes that were consistent between selections.
+](images/crispr/figure1.png "EVOS Fluorescence Microscope Image Capture"){#fig:sup:crispr:fig1 tag="S8" width="80%"}
+
+
+![
+**Fluorescence-Activated Cell Sorting Gate Setting.**
+A. HepG2_UnStained WT.
+B. HepG2_lentiV2 with viral transduction stained with LipidSpot™488.
+HepG2_lentiV2 cells were FAC sorted, and 20% of GFP-High and 20% of GFP-Low cell populations were collected.
+](images/crispr/figure2.png "Fluorescence-Activated Cell Sorting Gate Setting"){#fig:sup:crispr:fig2 tag="S9" width="80%"}
+
+
+![
+**Verification of sgRNA cassette and lentiV2 transgene.**
+A. 20nt sgRNA cassette was verified in lentiV2 transduced genomic DNA population, 163 bp PCR product obtained, while WT HepG2 didn’t possess the cassette, thus, no PCR product.
+B. lentiviral-specific transgene WPRE was verified in lentiV2 transduced genomic DNA population, while non-transduced WT didn’t have the transgene, therefore, no 173 bp PCR product was observed.
+For both panels A and B, 100 bp ladder was used in Lane 0.
+The CRISPR screening process was performed once, but we conducted two selections (high and low fluorescence) with a control of no/before selection. Subsequently, we generated three technical replicates for the DNA-seq libraries under each condition. In order to mitigate false positives resulting from the single-screen process, we overlapped the candidates from multiple pairwise differential analyses and selected the genes that were consistent between selections.
+](images/crispr/figure3.png "Verification of sgRNA cassette and lentiV2 transgene"){#fig:sup:crispr:fig3 tag="S10" width="80%"}
+
+
+![
+**Primers for generating Illumina libraries.**
+Sequences are provided in a readable format in Supplementary Data 8.
+](images/crispr/table1.png "Primers for Generating Illumina Libraries"){#fig:sup:crispr:table1 tag="S11" width="80%"}
+
+
+![
+**Illumina library generation.**
+A. Construct for generating Illumina libraries.
+B. Final Illumina library from HS DNA showed that a single ~285bp peak was generated.
+The CRISPR screening process was performed once, but we conducted two selections (high and low fluorescence) with a control of no/before selection. Subsequently, we generated three technical replicates for the DNA-seq libraries under each condition. In order to mitigate false positives resulting from the single-screen process, we overlapped the candidates from multiple pairwise differential analyses and selected the genes that were consistent between selections.
+](images/crispr/figure4.png "Illumina library generation"){#fig:sup:crispr:fig4 tag="S12" width="80%"}
+
+
+
+\clearpage
+
+### Gene modules enrichment for lipids gene-sets
+
+<!-- lipids_gene_sets:modules_enriched_increase:start -->
+| Gene module   | Lipids gene-set   | Leading edge      | p-value   |
+|:--------------|:------------------|:------------------|:----------|
+| **LV246**     | increase          | *DGAT2*, *ACACA*  | 0.0035    |
+| LV702         | increase          | *ACACA*, *DGAT2*  | 0.0046    |
+| **LV607**     | increase          | *ACACA*, *DGAT2*  | 0.0058    |
+| LV890         | increase          | *ACACA*, *DGAT2*  | 0.0067    |
+| **LV74**      | increase          | *MBTPS1*, *DGAT2* | 0.0078    |
+| **LV865**     | increase          | *ACACA*, *DGAT2*  | 0.0092    |
+| LV841         | increase          | *ACACA*, *DGAT2*  | 0.0096    |
+
+Table: Gene modules (LVs) nominally enriched (using FGSEA [@doi:10.1101/060012]) for the lipids-increasing gene-set from the CRISPR-screen (unadjusted $p$-values < 0.01).
+LVs significantly aligned with pathways (FDR < 0.05) from the MultiPLIER models are shown in boldface. {#tbl:sup:lipids_crispr:modules_enriched_increase tag="S1"}
+<!-- lipids_gene_sets:modules_enriched_increase:end -->
+
+
+<!-- lipids_gene_sets:modules_enriched_decrease:start -->
+| Gene module   | Lipids gene-set   | Leading edge       | p-value   |
+|:--------------|:------------------|:-------------------|:----------|
+| LV520         | decrease          | *FBXW7*, *TCF7L2*  | 0.0006    |
+| LV801         | decrease          | *UBE2J2*, *TCF7L2* | 0.0022    |
+| LV512         | decrease          | *FBXW7*, *TCF7L2*  | 0.0025    |
+| **LV612**     | decrease          | *PTEN*, *FBXW7*    | 0.0036    |
+| LV41          | decrease          | *PCYT2*, *TCF7L2*  | 0.0041    |
+| **LV838**     | decrease          | *UBE2J2*, *TCF7L2* | 0.0070    |
+| LV302         | decrease          | *TCF7L2*, *PTEN*   | 0.0083    |
+| LV959         | decrease          | *TCF7L2*, *PTEN*   | 0.0092    |
+
+Table: Gene modules (LVs) nominally enriched (using FGSEA [@doi:10.1101/060012]) for the lipids-decreasing gene-set from the CRISPR-screen (unadjusted $p$-values < 0.01).
+LVs significantly aligned with pathways (FDR < 0.05) from the MultiPLIER models are shown in boldface. {#tbl:sup:lipids_crispr:modules_enriched_decrease tag="S2"}
+<!-- lipids_gene_sets:modules_enriched_decrease:end -->
+
+
+\clearpage
+
+### Supplementary Note 2: Cluster analyses under the null hypothesis of no structure in the data {#sm:clustering:null_sim}
+
+For our clustering pipeline, we simulated different scenarios where there is no structure in the input data matrix $\hat{\mathbf{M}}$ (gene-trait associations from PhenomeXcan projected into the latent gene expression representation).
+For this, we simulated two cases where any groupings of traits are removed:
+1) the gene-trait association matrix $\mathbf{M}$ (from S-MultiXcan) does not have any meaningful structure to find groups of traits, while preserving the latent variables in $\mathbf{Z}$ from the MultiPLIER models;
+and 2) the latent variables in matrix $\mathbf{Z}$ do not have any meaningful structure to find groups of traits, while preserving the gene-trait association matrix $\mathbf{M}$.
+
+For the first scenario, we shuffled genes in $\mathbf{M}$ for each trait, and this randomized matrix was then projected into the latent space.
+For the second scenario, we projected matrix $\mathbf{M}$ into the latent space, and then shuffled LVs in $\hat{\mathbf{M}}$ for each trait.
+For each of these scenarios, we ran exactly the same clustering pipeline we used for the real data ([Methods](#sec:methods:clustering)), generating an ensemble of partitions that was later combined using the same consensus functions to derive the final partitions of traits.
+Finally, we computed
+1) stability statistics on the ensemble partitions from different algorithms
+and 2) the agreement of the final consensus partition with the ensemble.
+
+
+![
+**Agreement of consensus partitions with ensemble.**
+A real and two simulated scenarios with no data structure are shown.
+For each scenario, one final consensus partition was derived for each $k$ from 2 to 60 ($x$-axis) following our clustering pipeline.
+For each partition, the agreement with the corresponding ensemble was computed using the ARI ($y$-axis).
+For the real data scenario, partitions with an agreement above the 75th percentile (dashed line) were selected for follow-up analyses in the main text.
+](images/clustering/selected_best_partitions_by_k.svg "Agreement of consensus partitions with ensemble"){#fig:sup:clustering:agreement tag="S13" width="100%"}
+
+
+The results of this analysis (Figure @fig:sup:clustering:agreement) show that, under the two simulated null scenarios, the agreement of the consensus partitions with the ensemble is very close to zero.
+This means, as expected, that there is no consensus among ensemble partitions generated with different clustering algorithms and data representations.
+In contrast, using the real data, the consensus clustering approach finds trait pairs that are grouped together across the different members of the ensemble.
+The partitions above the 75th percentile were considered in the main analyses, and are shown in the clustering tree in Figure @fig:clustering:tree.
+
+
+\clearpage
+
+### Cluster-specific and general transcriptional processes associated with disease
+
+![
+**Cluster-specific and general transcriptional processes associated with disease using novel LVs.**
+The plot shows a submatrix of $\hat{\mathbf{M}}$ for the main trait clusters at $k$=29, considering only LVs (rows) that are not aligned with any pathway.
+Standardized values from -6 (lighter color) to 21 (darker color).
+](images/clustering/global_clustermap-novel-plain.svg "Heatmap with novel gene modules and traits"){#fig:sup:clustering:novel:heatmap tag="S14" width="100%"}
+
+
+\clearpage
+
+### Latent variables (gene modules) information
+
+#### LV603
+
+<!-- LV603:multiplier_pathways:start -->
+| Pathway                             | AUC   | FDR      |
+|:------------------------------------|:------|:---------|
+| IRIS Neutrophil-Resting             | 0.91  | 4.51e-35 |
+| SVM Neutrophils                     | 0.98  | 1.43e-09 |
+| PID IL8CXCR2 PATHWAY                | 0.81  | 7.04e-03 |
+| SIG PIP3 SIGNALING IN B LYMPHOCYTES | 0.77  | 1.95e-02 |
+
+Table: Pathways aligned to LV603 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv603 tag="S3"}
+<!-- LV603:multiplier_pathways:end -->
+
+<!-- LV603:phenomexcan_traits_assocs:start -->
+| Trait description                         | Sample size   | Cases   | FDR            |
+|:------------------------------------------|:--------------|:--------|:---------------|
+| Basophill percentage                      | 349,861       |         | 1.19e&#8209;10 |
+| Basophill count                           | 349,856       |         | 1.89e&#8209;05 |
+| Treatment/medication code: ispaghula husk | 361,141       | 327     | 1.36e&#8209;02 |
+
+Table: Significant trait associations of LV603 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv603 tag="S4"}
+<!-- LV603:phenomexcan_traits_assocs:end -->
+
+<!-- LV603:emerge_traits_assocs:start -->
+| Phecode                     | Trait description   | Sample size   | Cases   | FDR   |
+|:----------------------------|:--------------------|:--------------|:--------|:------|
+| No significant associations |                     |               |         |       |
+
+Table: Significant trait associations of LV603 in eMERGE. {#tbl:sup:emerge_assocs:lv603 tag="S5"}
+<!-- LV603:emerge_traits_assocs:end -->
+
+\clearpage
+
+#### LV246
+
+<!-- LV246:multiplier_pathways:start -->
+| Pathway                                                        | AUC   | FDR      |
+|:---------------------------------------------------------------|:------|:---------|
+| REACTOME FATTY ACID TRIACYLGLYCEROL AND KETONE BODY METABOLISM | 0.89  | 3.97e-16 |
+| REACTOME METABOLISM OF LIPIDS AND LIPOPROTEINS                 | 0.67  | 1.14e-08 |
+| REACTOME TRIGLYCERIDE BIOSYNTHESIS                             | 0.86  | 6.52e-04 |
+| KEGG PYRUVATE METABOLISM                                       | 0.82  | 2.66e-03 |
+| KEGG PROPANOATE METABOLISM                                     | 0.83  | 4.27e-03 |
+
+Table: Pathways aligned to LV246 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv246 tag="S6"}
+<!-- LV246:multiplier_pathways:end -->
+
+<!-- LV246:phenomexcan_traits_assocs:start -->
+| Trait description                                                                                       | Sample size   | Cases         | FDR            |
+|:--------------------------------------------------------------------------------------------------------|:--------------|:--------------|:---------------|
+| Triglycerides NMR                                                                          | 21,559        |         | 1.66e&#8209;26 |
+| LDL Cholesterol NMR                                                                        | 13,527        |         | 3.92e&#8209;26 |
+| High cholesterol (self-reported)                                                           | 361,141       | 43,957  | 1.08e&#8209;24 |
+| Cholesterol lowering medication                                                            | 193,148       | 24,247  | 4.28e&#8209;24 |
+| Treatment/medication code: simvastatin                                                     | 361,141       | 40,921  | 2.56e&#8209;19 |
+| CH2DB NMR                                                                                  | 24,154        |         | 1.05e&#8209;15 |
+| Cholesterol lowering medication                                                            | 165,340       | 38,057  | 9.58e&#8209;15 |
+| Treatment/medication code: atorvastatin                                                    | 361,141       | 10,805  | 2.54e&#8209;14 |
+| Illnesses of mother: Alzheimer's disease/dementia                                          | 331,041       | 28,507  | 2.76e&#8209;08 |
+| Illnesses of father: Alzheimer's disease/dementia                                          | 312,666       | 15,022  | 2.76e&#8209;08 |
+| Alzheimers Disease                                                                         | 54,162        | 17,008  | 1.10e&#8209;07 |
+| Non-butter spread type details: Flora Pro-Active or Benecol                                | 190,094       | 29,048  | 5.63e&#8209;07 |
+| Illnesses of siblings: Alzheimer's disease/dementia                                        | 279,062       | 1,609   | 6.16e&#8209;07 |
+| Any dementia                                                                               | 361,194       | 243     | 2.86e&#8209;05 |
+| Illnesses of father: None of the above (group 1)                                           | 314,797       | 116,736 | 3.56e&#8209;05 |
+| Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones (females) | 193,148       | 133,338 | 1.10e&#8209;04 |
+| Treatment/medication code: lipitor 10mg tablet                                             | 361,141       | 2,584   | 1.55e&#8209;04 |
+| Treatment/medication code: rosuvastatin                                                    | 361,141       | 2,227   | 1.37e&#8209;03 |
+| Illnesses of father: Heart disease                                                         | 318,570       | 104,110 | 1.89e&#8209;03 |
+| Dementia                                                                                   | 361,194       | 157     | 9.58e&#8209;03 |
+| Mother still alive                                                                         | 355,029       | 140,246 | 1.76e&#8209;02 |
+| Job SOC coding: Librarians                                                                 | 91,149        | 1,248   | 3.22e&#8209;02 |
+| Alzheimer’s disease                                                                        | 361,194       | 119     | 3.61e&#8209;02 |
+
+Table: Significant trait associations of LV246 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv246 tag="S7"}
+<!-- LV246:phenomexcan_traits_assocs:end -->
+
+<!-- LV246:emerge_traits_assocs:start -->
+| Phecode   | Trait description              | Sample size   | Cases   | FDR            |
+|:----------|:-------------------------------|:--------------|:--------|:---------------|
+| 272.11    | Hypercholesterolemia           | 40,786        | 14,138  | 4.40e&#8209;09 |
+| 272.1     | Hyperlipidemia                 | 55,843        | 29,195  | 3.57e&#8209;07 |
+| 272       | Disorders of lipoid metabolism | 55,892        | 29,244  | 3.79e&#8209;07 |
+| 292.3     | Memory loss                    | 48,785        | 2,094   | 1.80e&#8209;02 |
+
+Table: Significant trait associations of LV246 in eMERGE. {#tbl:sup:emerge_assocs:lv246 tag="S8"}
+<!-- LV246:emerge_traits_assocs:end -->
+
+\clearpage
+
+#### LV116
+
+<!-- LV116:multiplier_pathways:start -->
+| Pathway                                              | AUC   | FDR      |
+|:-----------------------------------------------------|:------|:---------|
+| REACTOME INTERFERON SIGNALING                        | 0.84  | 3.48e-09 |
+| SVM Macrophages M1                                   | 0.92  | 2.09e-05 |
+| REACTOME INTERFERON ALPHA BETA SIGNALING             | 0.94  | 3.36e-05 |
+| REACTOME CYTOKINE SIGNALING IN IMMUNE SYSTEM         | 0.67  | 1.53e-04 |
+| IRIS DendriticCell-LPSstimulated                     | 0.65  | 1.09e-03 |
+| KEGG CYTOSOLIC DNA SENSING PATHWAY                   | 0.84  | 3.22e-03 |
+| REACTOME NEGATIVE REGULATORS OF RIG I MDA5 SIGNALING | 0.81  | 1.61e-02 |
+
+Table: Pathways aligned to LV116 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv116 tag="S9"}
+<!-- LV116:multiplier_pathways:end -->
+
+\clearpage
+
+#### LV931
+
+
+<!-- LV931:multiplier_pathways:start -->
+| Pathway          | AUC   | FDR      |
+|:-----------------|:------|:---------|
+| MIPS SPLICEOSOME | 0.63  | 3.13e-02 |
+| PID TGFBRPATHWAY | 0.71  | 3.99e-02 |
+
+Table: Pathways aligned to LV931 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv931 tag="S10"}
+<!-- LV931:multiplier_pathways:end -->
+
+![
+**Cell types for LV931.**
+<!--  -->
+](images/lvs_analysis/lv931/lv931-cell_types.svg "Cell types for LV931"){#fig:sup:lv931 tag="S15" width="80%"}
+
+\clearpage
+
+#### LV66
+
+<!-- LV66:multiplier_pathways:start -->
+| Pathway                                        | AUC   | FDR      |
+|:-----------------------------------------------|:------|:---------|
+| REACTOME METABOLISM OF LIPIDS AND LIPOPROTEINS | 0.62  | 3.12e-04 |
+
+Table: Pathways aligned to LV66 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv66 tag="S11"}
+<!-- LV66:multiplier_pathways:end -->
+
+![
+**Cell types for LV66.**
+<!--  -->
+](images/lvs_analysis/lv66/lv66-cell_types.svg "Cell types for LV66"){#fig:sup:lv66 tag="S16" width="80%"}
+
+\clearpage
+
+#### LV928
+
+<!-- LV928:multiplier_pathways:start -->
+| Pathway   | AUC   | FDR      |
+|:----------|:------|:---------|
+| DMAP ERY3 | 0.81  | 1.16e-24 |
+| DMAP ERY4 | 0.78  | 2.49e-17 |
+
+Table: Pathways aligned to LV928 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv928 tag="S12"}
+<!-- LV928:multiplier_pathways:end -->
+
+<!-- LV928:phenomexcan_traits_assocs:start -->
+| Trait description                               | Sample size   | Cases   | FDR            |
+|:------------------------------------------------|:--------------|:--------|:---------------|
+| Mean sphered cell volume                        | 344,729       |         | 1.60e&#8209;20 |
+| Mean corpuscular haemoglobin concentration      | 350,468       |         | 1.42e&#8209;17 |
+| Mean reticulocyte volume                        | 344,728       |         | 1.77e&#8209;17 |
+| Reticulocyte count                              | 344,729       |         | 2.28e&#8209;10 |
+| Reticulocyte percentage                         | 344,728       |         | 1.37e&#8209;09 |
+| Red blood cell (erythrocyte) distribution width | 350,473       |         | 2.90e&#8209;09 |
+| Reticulocyte Count                              | 173,480       |         | 1.09e&#8209;07 |
+| Mean corpuscular volume                         | 350,473       |         | 1.46e&#8209;03 |
+| High light scatter reticulocyte count           | 344,729       |         | 3.49e&#8209;03 |
+| Age at first episode of depression              | 61,033        |         | 1.33e&#8209;02 |
+| High Light Scatter Reticulocyte Count           | 173,480       |         | 1.48e&#8209;02 |
+| Mean corpuscular haemoglobin                    | 350,472       |         | 4.02e&#8209;02 |
+
+Table: Significant trait associations of LV928 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv928 tag="S13"}
+<!-- LV928:phenomexcan_traits_assocs:end -->
+
+<!-- LV928:emerge_traits_assocs:start -->
+| Phecode                     | Trait description   | Sample size   | Cases   | FDR   |
+|:----------------------------|:--------------------|:--------------|:--------|:------|
+| No significant associations |                     |               |         |       |
+
+Table: Significant trait associations of LV928 in eMERGE. {#tbl:sup:emerge_assocs:lv928 tag="S14"}
+<!-- LV928:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV928.**
+<!--  -->
+](images/lvs_analysis/lv928/lv928-cell_types.svg "Cell types for LV928"){#fig:sup:lv928 tag="S17" width="80%"}
+
+\clearpage
+
+#### LV30
+
+<!-- LV30:multiplier_pathways:start -->
+| Pathway   | AUC   | FDR      |
+|:----------|:------|:---------|
+| DMAP ERY3 | 0.95  | 5.62e-52 |
+| DMAP ERY4 | 0.98  | 5.28e-51 |
+| DMAP ERY5 | 0.98  | 1.96e-49 |
+
+Table: Pathways aligned to LV30 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv30 tag="S15"}
+<!-- LV30:multiplier_pathways:end -->
+
+<!-- LV30:phenomexcan_traits_assocs:start -->
+| Trait description                                         | Sample size   | Cases   | FDR            |
+|:----------------------------------------------------------|:--------------|:--------|:---------------|
+| Mean reticulocyte volume                                  | 344,728       |         | 1.09e&#8209;32 |
+| Mean sphered cell volume                                  | 344,729       |         | 1.38e&#8209;24 |
+| Reticulocyte Count                                        | 173,480       |         | 6.28e&#8209;18 |
+| Reticulocyte percentage                                   | 344,728       |         | 1.27e&#8209;17 |
+| Mean corpuscular haemoglobin concentration                | 350,468       |         | 1.62e&#8209;17 |
+| Reticulocyte count                                        | 344,729       |         | 1.62e&#8209;17 |
+| High light scatter reticulocyte count                     | 344,729       |         | 4.78e&#8209;11 |
+| High Light Scatter Reticulocyte Count                     | 173,480       |         | 8.49e&#8209;11 |
+| Immature reticulocyte fraction                            | 344,728       |         | 4.31e&#8209;10 |
+| High light scatter reticulocyte percentage                | 344,729       |         | 1.21e&#8209;05 |
+| Mean corpuscular volume                                   | 350,473       |         | 2.28e&#8209;05 |
+| Red blood cell (erythrocyte) distribution width           | 350,473       |         | 3.00e&#8209;05 |
+| Mean platelet (thrombocyte) volume                        | 350,470       |         | 6.75e&#8209;04 |
+| Mean corpuscular haemoglobin                              | 350,472       |         | 3.90e&#8209;03 |
+| Illnesses of adopted mother: Chronic bronchitis/emphysema | 2,938         | 238     | 1.92e&#8209;02 |
+
+Table: Significant trait associations of LV30 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv30 tag="S16"}
+<!-- LV30:phenomexcan_traits_assocs:end -->
+
+<!-- LV30:emerge_traits_assocs:start -->
+| Phecode                     | Trait description   | Sample size   | Cases   | FDR   |
+|:----------------------------|:--------------------|:--------------|:--------|:------|
+| No significant associations |                     |               |         |       |
+
+Table: Significant trait associations of LV30 in eMERGE. {#tbl:sup:emerge_assocs:lv30 tag="S17"}
+<!-- LV30:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV30.**
+<!--  -->
+](images/lvs_analysis/lv30/lv30-cell_types.svg "Cell types for LV30"){#fig:sup:lv30 tag="S18" width="80%"}
+
+\clearpage
+
+#### LV730
+
+<!-- LV730:multiplier_pathways:start -->
+| Pathway    | AUC   | FDR      |
+|:-----------|:------|:---------|
+| DMAP MEGA2 | 0.82  | 2.64e-05 |
+
+Table: Pathways aligned to LV730 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv730 tag="S18"}
+<!-- LV730:multiplier_pathways:end -->
+
+<!-- LV730:phenomexcan_traits_assocs:start -->
+| Trait description                                                 | Sample size   | Cases   | FDR            |
+|:------------------------------------------------------------------|:--------------|:--------|:---------------|
+| Platelet distribution width                                       | 350,470       |         | 1.13e&#8209;10 |
+| Mean platelet (thrombocyte) volume                                | 350,470       |         | 3.47e&#8209;04 |
+| Reason former drinker stopped drinking alcohol: Financial reasons | 12,110        | 233     | 3.71e&#8209;02 |
+
+Table: Significant trait associations of LV730 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv730 tag="S19"}
+<!-- LV730:phenomexcan_traits_assocs:end -->
+
+<!-- LV730:emerge_traits_assocs:start -->
+| Phecode                     | Trait description   | Sample size   | Cases   | FDR   |
+|:----------------------------|:--------------------|:--------------|:--------|:------|
+| No significant associations |                     |               |         |       |
+
+Table: Significant trait associations of LV730 in eMERGE. {#tbl:sup:emerge_assocs:lv730 tag="S20"}
+<!-- LV730:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV730.**
+<!--  -->
+](images/lvs_analysis/lv730/lv730-cell_types.svg "Cell types for LV730"){#fig:sup:lv730 tag="S19" width="80%"}
+
+\clearpage
+
+#### LV598
+
+<!-- LV598:multiplier_pathways:start -->
+| Pathway                     | AUC   | FDR      |
+|:----------------------------|:------|:---------|
+| PID SYNDECAN 1 PATHWAY      | 0.81  | 1.20e-02 |
+| REACTOME COLLAGEN FORMATION | 0.77  | 1.89e-02 |
+
+Table: Pathways aligned to LV598 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv598 tag="S21"}
+<!-- LV598:multiplier_pathways:end -->
+
+<!-- LV598:phenomexcan_traits_assocs:start -->
+| Trait description                                       | Sample size   | Cases   | FDR            |
+|:--------------------------------------------------------|:--------------|:--------|:---------------|
+| Corneal resistance factor (right)                       | 76,630        |         | 4.05e&#8209;04 |
+| Corneal resistance factor (left)                        | 76,510        |         | 1.86e&#8209;03 |
+| 6mm strong meridian (left)                              | 65,551        |         | 2.58e&#8209;03 |
+| Corneal hysteresis (right)                              | 76,630        |         | 1.21e&#8209;02 |
+| 6mm strong meridian (right)                             | 66,256        |         | 2.18e&#8209;02 |
+| Treatment/medication code: evening primrose oil product | 361,141       | 814     | 2.58e&#8209;02 |
+| 6mm weak meridian (left)                                | 65,551        |         | 3.67e&#8209;02 |
+| Hand grip strength (left)                               | 359,704       |         | 4.15e&#8209;02 |
+| 3mm strong meridian (left)                              | 75,398        |         | 4.74e&#8209;02 |
+
+Table: Significant trait associations of LV598 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv598 tag="S22"}
+<!-- LV598:phenomexcan_traits_assocs:end -->
+
+<!-- LV598:emerge_traits_assocs:start -->
+| Phecode                     | Trait description   | Sample size   | Cases   | FDR   |
+|:----------------------------|:--------------------|:--------------|:--------|:------|
+| No significant associations |                     |               |         |       |
+
+Table: Significant trait associations of LV598 in eMERGE. {#tbl:sup:emerge_assocs:lv598 tag="S23"}
+<!-- LV598:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV598.**
+<!--  -->
+](images/lvs_analysis/lv598/lv598-cell_types.svg "Cell types for LV598"){#fig:sup:lv598 tag="S20" width="80%"}
+
+\clearpage
+
+#### LV57
+
+<!-- LV57:multiplier_pathways:start -->
+| Pathway                                | AUC   | FDR      |
+|:---------------------------------------|:------|:---------|
+| KEGG T CELL RECEPTOR SIGNALING PATHWAY | 0.70  | 1.26e-03 |
+| SVM T cells CD4 memory activated       | 0.79  | 2.59e-03 |
+| IRIS CD4Tcell-Th2-restimulated12hour   | 0.78  | 7.57e-03 |
+| KEGG ALLOGRAFT REJECTION               | 1.00  | 1.09e-02 |
+| Custom Treg                            | 0.98  | 1.37e-02 |
+| PID NFAT TFPATHWAY                     | 0.74  | 1.52e-02 |
+| IRIS MemoryTcell-RO-activated          | 0.70  | 2.87e-02 |
+
+Table: Pathways aligned to LV57 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv57 tag="S24"}
+<!-- LV57:multiplier_pathways:end -->
+
+<!-- LV57:phenomexcan_traits_assocs:start -->
+| Trait description                                                                                       | Sample size   | Cases         | FDR            |
+|:--------------------------------------------------------------------------------------------------------|:--------------|:--------------|:---------------|
+| Non-cancer illness code, self-reported: deep venous thrombosis (dvt)                                                       | 361,141       | 7,237         | 1.76e&#8209;13 |
+| Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor: Blood clot in the leg (DVT) | 360,527       | 7,386         | 1.22e&#8209;12 |
+| Diagnoses - main ICD10: I80 Phlebitis and thrombophlebitis                                                                 | 361,194       | 2,289         | 7.62e&#8209;12 |
+| DVT of lower extremities                                                                                                   | 361,194       | 2,116         | 1.27e&#8209;09 |
+| Venous thromboembolism                                                                                                     | 361,194       | 4,620         | 2.28e&#8209;08 |
+| DVT of lower extremities and pulmonary embolism                                                                            | 361,194       | 4,319         | 4.36e&#8209;08 |
+| Inflammatory Bowel Disease                                                                                                 | 34,652        | 12,882        | 1.95e&#8209;05 |
+| hypothyroidism (self-reported)                                                                                             | 361,141       | 17,574        | 3.84e&#8209;05 |
+| Medication: levothyroxine sodium                                                                                           | 361,141       | 14,689        | 8.43e&#8209;05 |
+| Mouth/teeth dental problems: Mouth ulcers                                                                                  | 359,841       | 36,831        | 1.02e&#8209;03 |
+| Crohns Disease                                                                                                             | 20,833        | 5,956         | 1.02e&#8209;02 |
+| Facial ageing                                                                                                              | 330,409       |               | 1.04e&#8209;02 |
+| Ulcerative Colitis                                                                                                         | 27,432        | 6,968         | 1.27e&#8209;02 |
+| Hair colour (natural, before greying): Black                                                                               | 360,270       | 15,809        | 1.99e&#8209;02 |
+| Hair colour (natural, before greying): Light brown                                                                         | 360,270       | 147,560       | 4.69e&#8209;02 |
+
+Table: Significant trait associations of LV57 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv57 tag="S25"}
+<!-- LV57:phenomexcan_traits_assocs:end -->
+
+<!-- LV57:emerge_traits_assocs:start -->
+| Phecode   | Trait description                    | Sample size   | Cases         | FDR            |
+|:----------|:-------------------------------------|:--------------|:--------------|:---------------|
+| 286       | Coagulation defects                  | 50,182        | 2,976         | 1.33e&#8209;11 |
+| 452       | Other venous embolism and thrombosis | 40,476        | 3,816         | 1.52e&#8209;05 |
+| 452.2     | Deep vein thrombosis [DVT]           | 38,791        | 2,131         | 4.47e&#8209;05 |
+| 244.4     | Hypothyroidism NOS                   | 53,968        | 9,284         | 1.12e&#8209;02 |
+| 244       | Hypothyroidism                       | 54,404        | 9,720         | 1.42e&#8209;02 |
+
+Table: Significant trait associations of LV57 in eMERGE. {#tbl:sup:emerge_assocs:lv57 tag="S26"}
+<!-- LV57:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV57.**
+](images/lvs_analysis/lv57/lv57-cell_types.svg "Cell types for LV57"){#fig:sup:lv57 tag="S21" width="80%"}
+
+\clearpage
+
+#### LV844
+
+<!-- LV844:multiplier_pathways:start -->
+| Pathway                                  | AUC   | FDR      |
+|:-----------------------------------------|:------|:---------|
+| KEGG ANTIGEN PROCESSING AND PRESENTATION | 0.80  | 1.35e-03 |
+
+Table: Pathways aligned to LV844 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv844 tag="S27"}
+<!-- LV844:multiplier_pathways:end -->
+
+<!-- LV844:phenomexcan_traits_assocs:start -->
+| Trait description                                               | Sample size   | Cases         | FDR            |
+|:----------------------------------------------------------------|:--------------|:--------------|:---------------|
+| Non-cancer illness code, self-reported: polymyalgia rheumatica  | 361,141       | 753           | 5.22e&#8209;06 |
+| Non-cancer illness code, self-reported: type 1 diabetes         | 361,141       | 318           | 4.71e&#8209;05 |
+| Type 1 diabetes with ketoacidosis                               | 361,194       | 168           | 1.03e&#8209;04 |
+| Age diabetes diagnosed                                          | 16,166        |               | 3.86e&#8209;04 |
+| Milk type used: Other type of milk                              | 360,806       | 4,213         | 5.48e&#8209;04 |
+| Non-cancer illness code, self-reported: appendicitis            | 361,141       | 3,058         | 6.12e&#8209;04 |
+| Diabetic ketoacidosis                                           | 361,194       | 234           | 7.13e&#8209;04 |
+| Rheumatoid Arthritis                                            | 80,799        | 19,234        | 7.46e&#8209;04 |
+| Type 1 diabetes without complications                           | 361,194       | 247           | 1.05e&#8209;03 |
+| Started insulin within one year diagnosis of diabetes           | 16,415        | 1,999         | 1.30e&#8209;03 |
+| Insulin medication (males)                                      | 165,340       | 2,248         | 3.61e&#8209;03 |
+| Medication: insulin product                                     | 361,141       | 3,545         | 5.48e&#8209;03 |
+| Insulin medication (females)                                    | 193,148       | 1,476         | 7.93e&#8209;03 |
+| Type 1 diabetes                                                 | 361,194       | 583           | 1.04e&#8209;02 |
+| Diagnoses - main ICD10: E10 Insulin-dependent diabetes mellitus | 361,194       | 470           | 1.08e&#8209;02 |
+| Treatment/medication code: sulfasalazine                        | 361,141       | 710           | 1.10e&#8209;02 |
+| malabsorption/coeliac disease (self-reported)                   | 361,141       | 1,587         | 3.12e&#8209;02 |
+| Job coding: school inspector, education inspector               | 89,866        | 238           | 3.71e&#8209;02 |
+| Seropositive rheumatoid arthritis                               | 361,194       | 327           | 3.86e&#8209;02 |
+| Non-cancer illness code, self-reported: rheumatoid arthritis    | 361,141       | 4,017         | 4.21e&#8209;02 |
+| Age hayfever or allergic rhinitis diagnosed by doctor           | 20,904        |               | 4.44e&#8209;02 |
+| Other/unspecified seropositive rheumatoid arthritis             | 361,194       | 299           | 4.88e&#8209;02 |
+
+Table: Significant trait associations of LV844 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv844 tag="S28"}
+<!-- LV844:phenomexcan_traits_assocs:end -->
+
+<!-- LV844:emerge_traits_assocs:start -->
+| Phecode                     | Trait description   | Sample size   | Cases   | FDR   |
+|:----------------------------|:--------------------|:--------------|:--------|:------|
+| No significant associations |                     |               |         |       |
+
+Table: Significant trait associations of LV844 in eMERGE. {#tbl:sup:emerge_assocs:lv844 tag="S29"}
+<!-- LV844:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV844.**
+](images/lvs_analysis/lv844/lv844-cell_types.svg "Cell types for LV844"){#fig:sup:lv844 tag="S22" width="80%"}
+
+\clearpage
+
+#### LV136
+
+<!-- LV136:multiplier_pathways:start -->
+| Pathway                       | AUC   | FDR      |
+|:------------------------------|:------|:---------|
+| PID INTEGRIN1 PATHWAY         | 0.88  | 9.35e-06 |
+| KEGG ECM RECEPTOR INTERACTION | 0.80  | 7.29e-05 |
+| REACTOME COLLAGEN FORMATION   | 0.87  | 2.00e-04 |
+| REACTOME MUSCLE CONTRACTION   | 0.75  | 1.49e-02 |
+
+Table: Pathways aligned to LV136 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv136 tag="S30"}
+<!-- LV136:multiplier_pathways:end -->
+
+<!-- LV136:phenomexcan_traits_assocs:start -->
+| Trait description                                               | Sample size   | Cases         | FDR            |
+|:----------------------------------------------------------------|:--------------|:--------------|:---------------|
+| Coronary atherosclerosis                                        | 361,194       | 14,334        | 1.84e&#8209;09 |
+| Chronic ischaemic heart disease (ICD10 I25)                     | 361,194       | 12,769        | 3.52e&#8209;09 |
+| Ischaemic heart disease (wide definition)                       | 361,194       | 20,857        | 3.95e&#8209;08 |
+| Coronary Artery Disease                                         | 184,305       | 60,801        | 4.18e&#8209;08 |
+| 3mm strong meridian (right)                                     | 75,410        |               | 5.54e&#8209;05 |
+| 6mm strong meridian (left)                                      | 65,551        |               | 1.35e&#8209;04 |
+| Corneal resistance factor (right)                               | 76,630        |               | 2.02e&#8209;04 |
+| 6mm strong meridian (right)                                     | 66,256        |               | 2.58e&#8209;04 |
+| Heart attack                                                    | 360,420       | 8,288         | 3.75e&#8209;04 |
+| Myocardial infarction                                           | 361,194       | 7,018         | 4.85e&#8209;04 |
+| Myocardial infarction, strict                                   | 361,194       | 7,018         | 4.85e&#8209;04 |
+| 3mm strong meridian (left)                                      | 75,398        |               | 6.65e&#8209;04 |
+| heart attack/myocardial infarction (self-reported)              | 361,141       | 8,239         | 1.07e&#8209;03 |
+| 6mm weak meridian (left)                                        | 65,551        |               | 1.10e&#8209;03 |
+| 6mm weak meridian (right)                                       | 66,256        |               | 1.61e&#8209;03 |
+| Acute myocardial infarction (ICD10 I21)                         | 361,194       | 5,948         | 2.24e&#8209;03 |
+| 3mm weak meridian (right)                                       | 75,410        |               | 3.69e&#8209;03 |
+| 3mm weak meridian (left)                                        | 75,398        |               | 3.96e&#8209;03 |
+| Intra-ocular pressure, Goldmann-correlated (right)              | 76,630        |               | 8.64e&#8209;03 |
+| 6mm asymmetry angle (right)                                     | 41,390        |               | 1.03e&#8209;02 |
+| Corneal resistance factor (left)                                | 76,510        |               | 1.03e&#8209;02 |
+| Other specified disorders of muscle                             | 361,194       | 257           | 1.09e&#8209;02 |
+| Major coronary heart disease event excluding revascularizations | 361,194       | 10,157        | 2.44e&#8209;02 |
+| Major coronary heart disease event                              | 361,194       | 10,157        | 2.44e&#8209;02 |
+| Non-cancer illness code, self-reported: angina                  | 361,141       | 11,370        | 4.53e&#8209;02 |
+| Eye problems/disorders: Glaucoma                                | 117,890       | 5,092         | 4.94e&#8209;02 |
+
+Table: Significant trait associations of LV136 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv136 tag="S31"}
+<!-- LV136:phenomexcan_traits_assocs:end -->
+
+<!-- LV136:emerge_traits_assocs:start -->
+| Phecode   | Trait description        | Sample size   | Cases   | FDR            |
+|:----------|:-------------------------|:--------------|:--------|:---------------|
+| 411.4     | Coronary atherosclerosis | 52,836        | 13,715  | 1.42e&#8209;03 |
+
+Table: Significant trait associations of LV136 in eMERGE. {#tbl:sup:emerge_assocs:lv136 tag="S32"}
+<!-- LV136:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV136.**
+<!--  -->
+Pulmonary microvascular endothelial cells were exposed to hypoxia for 24 hours or more [@url:https://www.ncbi.nlm.nih.gov/bioproject/PRJNA232177].
+](images/lvs_analysis/lv136/lv136-cell_types.svg "Cell types for LV136"){#fig:sup:lv136 tag="S23" width="80%"}
+
+\clearpage
+
+#### LV21
+
+<!-- LV21:multiplier_pathways:start -->
+| Pathway                            | AUC   | FDR   |
+|:-----------------------------------|:------|:------|
+| No pathways significantly enriched |       |       |
+
+Table: Pathways aligned to LV21 from the MultiPLIER models. {#tbl:sup:multiplier_pathways:lv21 tag="S33"}
+<!-- LV21:multiplier_pathways:end -->
+
+<!-- LV21:phenomexcan_traits_assocs:start -->
+| Trait description                                              | Sample size   | Cases         | FDR            |
+|:---------------------------------------------------------------|:--------------|:--------------|:---------------|
+| LDL Cholesterol NMR                                            | 13,527        |               | 1.08e&#8209;12 |
+| HDL Cholesterol NMR                                            | 19,270        |               | 3.03e&#8209;11 |
+| Alzheimers Disease                                             | 54,162        | 17,008        | 1.96e&#8209;09 |
+| Triglycerides NMR                                              | 21,559        |               | 2.05e&#8209;09 |
+| Illnesses of mother: Alzheimer's disease/dementia              | 331,041       | 28,507        | 1.36e&#8209;08 |
+| Illnesses of father: Alzheimer's disease/dementia              | 312,666       | 15,022        | 3.15e&#8209;08 |
+| Illnesses of siblings: Alzheimer's disease/dementia            | 279,062       | 1,609         | 1.55e&#8209;07 |
+| Any dementia                                                   | 361,194       | 243           | 5.63e&#8209;07 |
+| Treatment/medication code: simvastatin                         | 361,141       | 40,921        | 2.88e&#8209;06 |
+| Cholesterol lowering medication                                | 193,148       | 24,247        | 4.45e&#8209;05 |
+| High cholesterol (self-reported)                               | 361,141       | 43,957        | 9.90e&#8209;05 |
+| Cholesterol lowering medication                                | 165,340       | 38,057        | 4.86e&#8209;04 |
+| Dementia                                                       | 361,194       | 157           | 9.80e&#8209;04 |
+| Alzheimer’s disease                                            | 361,194       | 119           | 1.33e&#8209;03 |
+| Mean reticulocyte volume                                       | 344,728       |               | 1.76e&#8209;03 |
+| Father's age at death                                          | 266,231       |               | 6.68e&#8209;03 |
+| Illnesses of mother: None of the above (group 1)               | 332,611       | 138,291       | 1.42e&#8209;02 |
+| ECG, phase time                                                | 53,998        |               | 1.60e&#8209;02 |
+| Treatment/medication code: atorvastatin                        | 361,141       | 10,805        | 2.92e&#8209;02 |
+| Mean sphered cell volume                                       | 344,729       |               | 3.33e&#8209;02 |
+| Non-cancer illness code, self-reported: cellulitis             | 361,141       | 232           | 3.40e&#8209;02 |
+| Medication for cholesterol, blood pressure or diabetes (males) | 165,340       | 110,372       | 3.66e&#8209;02 |
+| Mother still alive                                             | 355,029       | 140,246       | 4.96e&#8209;02 |
+
+Table: Significant trait associations of LV21 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv21 tag="S34"}
+<!-- LV21:phenomexcan_traits_assocs:end -->
+
+<!-- LV21:emerge_traits_assocs:start -->
+| Phecode   | Trait description              | Sample size   | Cases         | FDR            |
+|:----------|:-------------------------------|:--------------|:--------------|:---------------|
+| 272.1     | Hyperlipidemia                 | 55,843        | 29,195        | 4.22e&#8209;03 |
+| 272       | Disorders of lipoid metabolism | 55,892        | 29,244        | 4.50e&#8209;03 |
+
+Table: Significant trait associations of LV21 in eMERGE. {#tbl:sup:emerge_assocs:lv21 tag="S35"}
+<!-- LV21:emerge_traits_assocs:end -->
+
+![
+**Cell types for LV21.**
+<!--  -->
+](images/lvs_analysis/lv21/lv21-cell_types.svg "Cell types for LV21"){#fig:sup:lv21 tag="S24" width="80%"}
diff --git a/tests/manuscripts/phenoplier_full/ai_revision-config.yaml b/tests/manuscripts/phenoplier_full/ai_revision-config.yaml
new file mode 100644
index 0000000..d166741
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/ai_revision-config.yaml
@@ -0,0 +1,25 @@
+files:
+  matchings:
+    - files:
+        - abstract
+      prompt: abstract
+    - files:
+        - introduction
+      prompt: introduction_discussion
+    - files:
+        - 04\..+\.md
+      prompt: results
+    - files:
+        - discussion
+      prompt: introduction_discussion
+    - files:
+        - methods
+      prompt: methods
+  
+  default_prompt: default prompt text
+  
+  ignore:
+    - front\-matter
+    - acknowledgements
+    - supplementary_material
+    - references
\ No newline at end of file
diff --git a/tests/manuscripts/phenoplier_full/ai_revision-prompts.yaml b/tests/manuscripts/phenoplier_full/ai_revision-prompts.yaml
new file mode 100644
index 0000000..acb4594
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/ai_revision-prompts.yaml
@@ -0,0 +1,15 @@
+prompts:
+  abstract: |
+    Test match abstract.
+
+  introduction_discussion: |
+    Test match introduction or discussion.
+
+  results: |
+    Test match results.
+
+  methods: |
+    Test match methods.
+
+  default: |
+    Proofread the following paragraph
diff --git a/tests/manuscripts/phenoplier_full/manual-references.json b/tests/manuscripts/phenoplier_full/manual-references.json
new file mode 100644
index 0000000..8ae1023
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/manual-references.json
@@ -0,0 +1,9296 @@
+[
+  {
+    "type": "article-journal",
+    "id": "MxGpAiPu",
+    "container-title": "Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms",
+    "author": [
+      {
+        "family": "Arthur",
+        "given": "David"
+      },
+      {
+        "family": "Vassilvitskii",
+        "given": "Sergei"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2007
+        ]
+      ]
+    },
+    "page": "1027-1035",
+    "title": "k-means++: the advantages of careful seeding",
+    "URL": "http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Arthur2007"
+  },
+  {
+    "type": "article-journal",
+    "id": "JrL3iQea",
+    "container-title": "Proceedings of the Second International Conference on Knowledge Discovery and Data Mining",
+    "author": [
+      {
+        "family": "Ester",
+        "given": "Martin"
+      },
+      {
+        "family": "Kriegel",
+        "given": "Hans-Peter"
+      },
+      {
+        "family": "Sander",
+        "given": "Jörg"
+      },
+      {
+        "family": "Xu",
+        "given": "Xiaowei"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          1996
+        ]
+      ]
+    },
+    "page": "226-231",
+    "title": "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise",
+    "URL": "https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Ester1996"
+  },
+  {
+    "type": "article-journal",
+    "id": "x3CT24TB",
+    "container-title": "Advances in Neural Information Processing Systems",
+    "author": [
+      {
+        "family": "Ng",
+        "given": "Andrew"
+      },
+      {
+        "family": "Jordan",
+        "given": "Michael"
+      },
+      {
+        "family": "Weiss",
+        "given": "Yair"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2001
+        ]
+      ]
+    },
+    "page": "849-856",
+    "title": "On Spectral Clustering: Analysis and an algorithm",
+    "URL": "https://ai.stanford.edu/~ang/papers/nips01-spectral.pdf",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Ng2001"
+  },
+  {
+    "type": "article-journal",
+    "id": "uw3AnEgA",
+    "container-title": "Journal of Machine Learning Research",
+    "author": [
+      {
+        "family": "Strehl",
+        "given": "Alexander"
+      },
+      {
+        "family": "Ghosh",
+        "given": "Joydeep"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2002
+        ]
+      ]
+    },
+    "volume": "3",
+    "page": "583-617",
+    "title": "Cluster Ensembles – A Knowledge Reuse Framework for Combining Multiple Partitions",
+    "URL": "https://www.jmlr.org/papers/v3/strehl02a.html",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Strehl2002"
+  },
+  {
+    "id": "157h5hA34",
+    "URL": "https://arxiv.org/abs/1802.03426",
+    "number": "1802.03426",
+    "title": "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          21
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Leland",
+        "family": "McInnes"
+      },
+      {
+        "given": "John",
+        "family": "Healy"
+      },
+      {
+        "given": "James",
+        "family": "Melville"
+      }
+    ],
+    "container-title": "arXiv",
+    "publisher": "arXiv",
+    "type": "report",
+    "abstract": "  UMAP (Uniform Manifold Approximation and Projection) is a novel manifold learning technique for dimension reduction. UMAP is constructed from a theoretical framework based in Riemannian geometry and algebraic topology. The result is a practical scalable algorithm that applies to real world data. The UMAP algorithm is competitive with t-SNE for visualization quality, and arguably preserves more of the global structure with superior run time performance. Furthermore, UMAP has no computational restrictions on embedding dimension, making it viable as a general purpose dimension reduction technique for machine learning. ",
+    "note": "license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/\nThis CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: arxiv:1802.03426"
+  },
+  {
+    "publisher": "Wiley",
+    "issue": "8",
+    "DOI": "10.1002/gepi.22346",
+    "type": "article-journal",
+    "page": "854-867",
+    "source": "Crossref",
+    "title": "Fine‐mapping and QTL tissue‐sharing information improves the reliability of causal gene identification",
+    "volume": "44",
+    "author": [
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Owen J.",
+        "family": "Melia"
+      },
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Rodrigo",
+        "family": "Bonazzola"
+      },
+      {
+        "given": "Gao",
+        "family": "Wang"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "François",
+        "family": "Aguet"
+      },
+      {
+        "given": "Kristin G.",
+        "family": "Ardlie"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      },
+      {
+        "given": "Hae K.",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Genetic Epidemiology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqsvf7",
+    "container-title-short": "Genetic Epidemiology",
+    "PMCID": "PMC7693040",
+    "PMID": "32964524",
+    "id": "SiobXsoB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1002/gepi.22346"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1007/bf01908075",
+    "type": "article-journal",
+    "page": "193-218",
+    "source": "Crossref",
+    "title": "Comparing partitions",
+    "volume": "2",
+    "author": [
+      {
+        "given": "Lawrence",
+        "family": "Hubert"
+      },
+      {
+        "given": "Phipps",
+        "family": "Arabie"
+      }
+    ],
+    "container-title": "Journal of Classification",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          1985,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bphmzh",
+    "container-title-short": "Journal of Classification",
+    "id": "e4AuoW8N",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1007/bf01908075"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "5",
+    "DOI": "10.1016/j.ajhg.2018.04.002",
+    "type": "article-journal",
+    "page": "717-730",
+    "source": "Crossref",
+    "title": "The Post-GWAS Era: From Association to Function",
+    "volume": "102",
+    "author": [
+      {
+        "given": "Michael D.",
+        "family": "Gallagher"
+      },
+      {
+        "given": "Alice S.",
+        "family": "Chen-Plotkin"
+      }
+    ],
+    "container-title": "The American Journal of Human Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          5
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdmftd",
+    "container-title-short": "The American Journal of Human Genetics",
+    "PMCID": "PMC5986732",
+    "PMID": "29727686",
+    "id": "aIyQY5ZT",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.ajhg.2018.04.002"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "1",
+    "DOI": "10.1016/j.ajhg.2020.11.012",
+    "type": "article-journal",
+    "page": "25-35",
+    "source": "Crossref",
+    "title": "Probabilistic colocalization of genetic variants from complex and molecular traits: promise and limitations",
+    "volume": "108",
+    "author": [
+      {
+        "given": "Abhay",
+        "family": "Hukku"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Francesca",
+        "family": "Luca"
+      },
+      {
+        "given": "Roger",
+        "family": "Pique-Regi"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      }
+    ],
+    "container-title": "The American Journal of Human Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gj58gg",
+    "container-title-short": "The American Journal of Human Genetics",
+    "PMCID": "PMC7820626",
+    "PMID": "33308443",
+    "id": "ndd3tW4g",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.ajhg.2020.11.012"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "8",
+    "DOI": "10.1016/j.amjcard.2008.02.029",
+    "type": "article-journal",
+    "page": "S20-S26",
+    "source": "Crossref",
+    "title": "Mechanism of Action of Niacin",
+    "volume": "101",
+    "author": [
+      {
+        "given": "Vaijinath S.",
+        "family": "Kamanna"
+      },
+      {
+        "given": "Moti L.",
+        "family": "Kashyap"
+      }
+    ],
+    "container-title": "The American Journal of Cardiology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/c8zwdt",
+    "container-title-short": "The American Journal of Cardiology",
+    "PMID": "18375237",
+    "id": "LVihFr3g",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.amjcard.2008.02.029"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "2",
+    "DOI": "10.1016/j.cell.2011.01.004",
+    "type": "article-journal",
+    "page": "296-309",
+    "source": "Crossref",
+    "title": "Densely Interconnected Transcriptional Circuits Control Cell States in Human Hematopoiesis",
+    "volume": "144",
+    "author": [
+      {
+        "given": "Noa",
+        "family": "Novershtern"
+      },
+      {
+        "given": "Aravind",
+        "family": "Subramanian"
+      },
+      {
+        "given": "Lee N.",
+        "family": "Lawton"
+      },
+      {
+        "given": "Raymond H.",
+        "family": "Mak"
+      },
+      {
+        "given": "W. Nicholas",
+        "family": "Haining"
+      },
+      {
+        "given": "Marie E.",
+        "family": "McConkey"
+      },
+      {
+        "given": "Naomi",
+        "family": "Habib"
+      },
+      {
+        "given": "Nir",
+        "family": "Yosef"
+      },
+      {
+        "given": "Cindy Y.",
+        "family": "Chang"
+      },
+      {
+        "given": "Tal",
+        "family": "Shay"
+      },
+      {
+        "given": "Garrett M.",
+        "family": "Frampton"
+      },
+      {
+        "given": "Adam C.B.",
+        "family": "Drake"
+      },
+      {
+        "given": "Ilya",
+        "family": "Leskov"
+      },
+      {
+        "given": "Bjorn",
+        "family": "Nilsson"
+      },
+      {
+        "given": "Fred",
+        "family": "Preffer"
+      },
+      {
+        "given": "David",
+        "family": "Dombkowski"
+      },
+      {
+        "given": "John W.",
+        "family": "Evans"
+      },
+      {
+        "given": "Ted",
+        "family": "Liefeld"
+      },
+      {
+        "given": "John S.",
+        "family": "Smutko"
+      },
+      {
+        "given": "Jianzhu",
+        "family": "Chen"
+      },
+      {
+        "given": "Nir",
+        "family": "Friedman"
+      },
+      {
+        "given": "Richard A.",
+        "family": "Young"
+      },
+      {
+        "given": "Todd R.",
+        "family": "Golub"
+      },
+      {
+        "given": "Aviv",
+        "family": "Regev"
+      },
+      {
+        "given": "Benjamin L.",
+        "family": "Ebert"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cf5k92",
+    "container-title-short": "Cell",
+    "PMCID": "PMC3049864",
+    "PMID": "21241896",
+    "id": "Zk82GvJV",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2011.01.004"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "7",
+    "DOI": "10.1016/j.cell.2017.05.038",
+    "type": "article-journal",
+    "page": "1177-1186",
+    "source": "Crossref",
+    "title": "An Expanded View of Complex Traits: From Polygenic to Omnigenic",
+    "volume": "169",
+    "author": [
+      {
+        "given": "Evan A.",
+        "family": "Boyle"
+      },
+      {
+        "given": "Yang I.",
+        "family": "Li"
+      },
+      {
+        "given": "Jonathan K.",
+        "family": "Pritchard"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          6
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gcpgdz",
+    "container-title-short": "Cell",
+    "PMCID": "PMC5536862",
+    "PMID": "28622505",
+    "id": "vpIDZCSa",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2017.05.038"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "6",
+    "DOI": "10.1016/j.cell.2017.10.049",
+    "type": "article-journal",
+    "page": "1437-1452.e17",
+    "source": "Crossref",
+    "title": "A Next Generation Connectivity Map: L1000 Platform and the First 1,000,000 Profiles",
+    "volume": "171",
+    "author": [
+      {
+        "given": "Aravind",
+        "family": "Subramanian"
+      },
+      {
+        "given": "Rajiv",
+        "family": "Narayan"
+      },
+      {
+        "given": "Steven M.",
+        "family": "Corsello"
+      },
+      {
+        "given": "David D.",
+        "family": "Peck"
+      },
+      {
+        "given": "Ted E.",
+        "family": "Natoli"
+      },
+      {
+        "given": "Xiaodong",
+        "family": "Lu"
+      },
+      {
+        "given": "Joshua",
+        "family": "Gould"
+      },
+      {
+        "given": "John F.",
+        "family": "Davis"
+      },
+      {
+        "given": "Andrew A.",
+        "family": "Tubelli"
+      },
+      {
+        "given": "Jacob K.",
+        "family": "Asiedu"
+      },
+      {
+        "given": "David L.",
+        "family": "Lahr"
+      },
+      {
+        "given": "Jodi E.",
+        "family": "Hirschman"
+      },
+      {
+        "given": "Zihan",
+        "family": "Liu"
+      },
+      {
+        "given": "Melanie",
+        "family": "Donahue"
+      },
+      {
+        "given": "Bina",
+        "family": "Julian"
+      },
+      {
+        "given": "Mariya",
+        "family": "Khan"
+      },
+      {
+        "given": "David",
+        "family": "Wadden"
+      },
+      {
+        "given": "Ian C.",
+        "family": "Smith"
+      },
+      {
+        "given": "Daniel",
+        "family": "Lam"
+      },
+      {
+        "given": "Arthur",
+        "family": "Liberzon"
+      },
+      {
+        "given": "Courtney",
+        "family": "Toder"
+      },
+      {
+        "given": "Mukta",
+        "family": "Bagul"
+      },
+      {
+        "given": "Marek",
+        "family": "Orzechowski"
+      },
+      {
+        "given": "Oana M.",
+        "family": "Enache"
+      },
+      {
+        "given": "Federica",
+        "family": "Piccioni"
+      },
+      {
+        "given": "Sarah A.",
+        "family": "Johnson"
+      },
+      {
+        "given": "Nicholas J.",
+        "family": "Lyons"
+      },
+      {
+        "given": "Alice H.",
+        "family": "Berger"
+      },
+      {
+        "given": "Alykhan F.",
+        "family": "Shamji"
+      },
+      {
+        "given": "Angela N.",
+        "family": "Brooks"
+      },
+      {
+        "given": "Anita",
+        "family": "Vrcic"
+      },
+      {
+        "given": "Corey",
+        "family": "Flynn"
+      },
+      {
+        "given": "Jacqueline",
+        "family": "Rosains"
+      },
+      {
+        "given": "David Y.",
+        "family": "Takeda"
+      },
+      {
+        "given": "Roger",
+        "family": "Hu"
+      },
+      {
+        "given": "Desiree",
+        "family": "Davison"
+      },
+      {
+        "given": "Justin",
+        "family": "Lamb"
+      },
+      {
+        "given": "Kristin",
+        "family": "Ardlie"
+      },
+      {
+        "given": "Larson",
+        "family": "Hogstrom"
+      },
+      {
+        "given": "Peyton",
+        "family": "Greenside"
+      },
+      {
+        "given": "Nathanael S.",
+        "family": "Gray"
+      },
+      {
+        "given": "Paul A.",
+        "family": "Clemons"
+      },
+      {
+        "given": "Serena",
+        "family": "Silver"
+      },
+      {
+        "given": "Xiaoyun",
+        "family": "Wu"
+      },
+      {
+        "given": "Wen-Ning",
+        "family": "Zhao"
+      },
+      {
+        "given": "Willis",
+        "family": "Read-Button"
+      },
+      {
+        "given": "Xiaohua",
+        "family": "Wu"
+      },
+      {
+        "given": "Stephen J.",
+        "family": "Haggarty"
+      },
+      {
+        "given": "Lucienne V.",
+        "family": "Ronco"
+      },
+      {
+        "given": "Jesse S.",
+        "family": "Boehm"
+      },
+      {
+        "given": "Stuart L.",
+        "family": "Schreiber"
+      },
+      {
+        "given": "John G.",
+        "family": "Doench"
+      },
+      {
+        "given": "Joshua A.",
+        "family": "Bittker"
+      },
+      {
+        "given": "David E.",
+        "family": "Root"
+      },
+      {
+        "given": "Bang",
+        "family": "Wong"
+      },
+      {
+        "given": "Todd R.",
+        "family": "Golub"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cgwt",
+    "container-title-short": "Cell",
+    "PMCID": "PMC5990023",
+    "PMID": "29195078",
+    "id": "F7lIlh2N",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2017.10.049"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "4",
+    "DOI": "10.1016/j.cell.2019.04.014",
+    "type": "article-journal",
+    "page": "1022-1034.e6",
+    "source": "Crossref",
+    "title": "Trans Effects on Gene Expression Can Drive Omnigenic Inheritance",
+    "volume": "177",
+    "author": [
+      {
+        "given": "Xuanyao",
+        "family": "Liu"
+      },
+      {
+        "given": "Yang I.",
+        "family": "Li"
+      },
+      {
+        "given": "Jonathan K.",
+        "family": "Pritchard"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          5
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfz8bj",
+    "container-title-short": "Cell",
+    "PMCID": "PMC6553491",
+    "PMID": "31051098",
+    "id": "LXvTZzEA",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2019.04.014"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "6",
+    "DOI": "10.1016/j.cels.2015.12.004",
+    "type": "article-journal",
+    "page": "417-425",
+    "source": "Crossref",
+    "title": "The Molecular Signatures Database Hallmark Gene Set Collection",
+    "volume": "1",
+    "author": [
+      {
+        "given": "Arthur",
+        "family": "Liberzon"
+      },
+      {
+        "given": "Chet",
+        "family": "Birger"
+      },
+      {
+        "given": "Helga",
+        "family": "Thorvaldsdóttir"
+      },
+      {
+        "given": "Mahmoud",
+        "family": "Ghandi"
+      },
+      {
+        "given": "Jill P.",
+        "family": "Mesirov"
+      },
+      {
+        "given": "Pablo",
+        "family": "Tamayo"
+      }
+    ],
+    "container-title": "Cell Systems",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf78hq",
+    "container-title-short": "Cell Systems",
+    "PMCID": "PMC4707969",
+    "PMID": "26771021",
+    "id": "1CbVoEpNJ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cels.2015.12.004"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "5",
+    "DOI": "10.1016/j.cels.2019.04.003",
+    "type": "article-journal",
+    "page": "380-394.e4",
+    "source": "Crossref",
+    "title": "MultiPLIER: A Transfer Learning Framework for Transcriptomics Reveals Systemic Features of Rare Disease",
+    "volume": "8",
+    "author": [
+      {
+        "given": "Jaclyn N.",
+        "family": "Taroni"
+      },
+      {
+        "given": "Peter C.",
+        "family": "Grayson"
+      },
+      {
+        "given": "Qiwen",
+        "family": "Hu"
+      },
+      {
+        "given": "Sean",
+        "family": "Eddy"
+      },
+      {
+        "given": "Matthias",
+        "family": "Kretzler"
+      },
+      {
+        "given": "Peter A.",
+        "family": "Merkel"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      }
+    ],
+    "container-title": "Cell Systems",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          5
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf75g5",
+    "container-title-short": "Cell Systems",
+    "PMCID": "PMC6538307",
+    "PMID": "31121115",
+    "id": "14rnBunuZ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cels.2019.04.003"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "DOI": "10.1016/j.ins.2016.04.027",
+    "type": "article-journal",
+    "page": "120-134",
+    "source": "Crossref",
+    "title": "Diversity control for improving the analysis of consensus clustering",
+    "volume": "361-362",
+    "author": [
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Georgina",
+        "family": "Stegmayer"
+      },
+      {
+        "given": "Diego H.",
+        "family": "Milone"
+      }
+    ],
+    "container-title": "Information Sciences",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          9
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghtqbk",
+    "container-title-short": "Information Sciences",
+    "id": "8js8Q3pF",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.ins.2016.04.027"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "10",
+    "DOI": "10.1016/j.patcog.2014.04.005",
+    "type": "article-journal",
+    "page": "3362-3375",
+    "source": "Crossref",
+    "title": "Hybrid clustering solution selection strategy",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Zhiwen",
+        "family": "Yu"
+      },
+      {
+        "given": "Le",
+        "family": "Li"
+      },
+      {
+        "given": "Yunjun",
+        "family": "Gao"
+      },
+      {
+        "given": "Jane",
+        "family": "You"
+      },
+      {
+        "given": "Jiming",
+        "family": "Liu"
+      },
+      {
+        "given": "Hau-San",
+        "family": "Wong"
+      },
+      {
+        "given": "Guoqiang",
+        "family": "Han"
+      }
+    ],
+    "container-title": "Pattern Recognition",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghtzwt",
+    "container-title-short": "Pattern Recognition",
+    "id": "t5p3UpxZ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.patcog.2014.04.005"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "7",
+    "DOI": "10.1016/j.tips.2006.05.008",
+    "type": "article-journal",
+    "page": "384-390",
+    "source": "Crossref",
+    "title": "The nicotinic acid receptor GPR109A (HM74A or PUMA-G) as a new therapeutic target",
+    "volume": "27",
+    "author": [
+      {
+        "given": "S",
+        "family": "OFFERMANNS"
+      }
+    ],
+    "container-title": "Trends in Pharmacological Sciences",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2006,
+          7
+        ]
+      ]
+    },
+    "URL": "https://doi.org/fgb4tr",
+    "container-title-short": "Trends in Pharmacological Sciences",
+    "PMID": "16766048",
+    "id": "izbPw2kc",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.tips.2006.05.008"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "10",
+    "DOI": "10.1038/gim.2013.72",
+    "type": "article-journal",
+    "page": "761-771",
+    "source": "Crossref",
+    "title": "The Electronic Medical Records and Genomics (eMERGE) Network: past, present, and future",
+    "volume": "15",
+    "author": [
+      {
+        "given": "Omri",
+        "family": "Gottesman"
+      },
+      {
+        "given": "Helena",
+        "family": "Kuivaniemi"
+      },
+      {
+        "given": "Gerard",
+        "family": "Tromp"
+      },
+      {
+        "given": "W. Andrew",
+        "family": "Faucett"
+      },
+      {
+        "given": "Rongling",
+        "family": "Li"
+      },
+      {
+        "given": "Teri A.",
+        "family": "Manolio"
+      },
+      {
+        "given": "Saskia C.",
+        "family": "Sanderson"
+      },
+      {
+        "given": "Joseph",
+        "family": "Kannry"
+      },
+      {
+        "given": "Randi",
+        "family": "Zinberg"
+      },
+      {
+        "given": "Melissa A.",
+        "family": "Basford"
+      },
+      {
+        "given": "Murray",
+        "family": "Brilliant"
+      },
+      {
+        "given": "David J.",
+        "family": "Carey"
+      },
+      {
+        "given": "Rex L.",
+        "family": "Chisholm"
+      },
+      {
+        "given": "Christopher G.",
+        "family": "Chute"
+      },
+      {
+        "given": "John J.",
+        "family": "Connolly"
+      },
+      {
+        "given": "David",
+        "family": "Crosslin"
+      },
+      {
+        "given": "Joshua C.",
+        "family": "Denny"
+      },
+      {
+        "given": "Carlos J.",
+        "family": "Gallego"
+      },
+      {
+        "given": "Jonathan L.",
+        "family": "Haines"
+      },
+      {
+        "given": "Hakon",
+        "family": "Hakonarson"
+      },
+      {
+        "given": "John",
+        "family": "Harley"
+      },
+      {
+        "given": "Gail P.",
+        "family": "Jarvik"
+      },
+      {
+        "given": "Isaac",
+        "family": "Kohane"
+      },
+      {
+        "given": "Iftikhar J.",
+        "family": "Kullo"
+      },
+      {
+        "given": "Eric B.",
+        "family": "Larson"
+      },
+      {
+        "given": "Catherine",
+        "family": "McCarty"
+      },
+      {
+        "given": "Marylyn D.",
+        "family": "Ritchie"
+      },
+      {
+        "given": "Dan M.",
+        "family": "Roden"
+      },
+      {
+        "given": "Maureen E.",
+        "family": "Smith"
+      },
+      {
+        "given": "Erwin P.",
+        "family": "Böttinger"
+      },
+      {
+        "given": "Marc S.",
+        "family": "Williams"
+      }
+    ],
+    "container-title": "Genetics in Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f5dwbt",
+    "container-title-short": "Genetics in Medicine",
+    "PMCID": "PMC3795928",
+    "PMID": "23743551",
+    "id": "wfqjCerX",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/gim.2013.72"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7414",
+    "DOI": "10.1038/nature11247",
+    "type": "article-journal",
+    "page": "57-74",
+    "source": "Crossref",
+    "title": "An integrated encyclopedia of DNA elements in the human genome",
+    "volume": "489",
+    "author": [
+      {}
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2012,
+          9
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bg9d",
+    "container-title-short": "Nature",
+    "PMCID": "PMC3439153",
+    "PMID": "22955616",
+    "id": "15J98V2qM",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nature11247"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7493",
+    "DOI": "10.1038/nature12787",
+    "type": "article-journal",
+    "page": "455-461",
+    "source": "Crossref",
+    "title": "An atlas of active enhancers across human cell types and tissues",
+    "volume": "507",
+    "author": [
+      {
+        "given": "Robin",
+        "family": "Andersson"
+      },
+      {},
+      {
+        "given": "Claudia",
+        "family": "Gebhard"
+      },
+      {
+        "given": "Irene",
+        "family": "Miguel-Escalada"
+      },
+      {
+        "given": "Ilka",
+        "family": "Hoof"
+      },
+      {
+        "given": "Jette",
+        "family": "Bornholdt"
+      },
+      {
+        "given": "Mette",
+        "family": "Boyd"
+      },
+      {
+        "given": "Yun",
+        "family": "Chen"
+      },
+      {
+        "given": "Xiaobei",
+        "family": "Zhao"
+      },
+      {
+        "given": "Christian",
+        "family": "Schmidl"
+      },
+      {
+        "given": "Takahiro",
+        "family": "Suzuki"
+      },
+      {
+        "given": "Evgenia",
+        "family": "Ntini"
+      },
+      {
+        "given": "Erik",
+        "family": "Arner"
+      },
+      {
+        "given": "Eivind",
+        "family": "Valen"
+      },
+      {
+        "given": "Kang",
+        "family": "Li"
+      },
+      {
+        "given": "Lucia",
+        "family": "Schwarzfischer"
+      },
+      {
+        "given": "Dagmar",
+        "family": "Glatz"
+      },
+      {
+        "given": "Johanna",
+        "family": "Raithel"
+      },
+      {
+        "given": "Berit",
+        "family": "Lilje"
+      },
+      {
+        "given": "Nicolas",
+        "family": "Rapin"
+      },
+      {
+        "given": "Frederik Otzen",
+        "family": "Bagger"
+      },
+      {
+        "given": "Mette",
+        "family": "Jørgensen"
+      },
+      {
+        "given": "Peter Refsing",
+        "family": "Andersen"
+      },
+      {
+        "given": "Nicolas",
+        "family": "Bertin"
+      },
+      {
+        "given": "Owen",
+        "family": "Rackham"
+      },
+      {
+        "given": "A. Maxwell",
+        "family": "Burroughs"
+      },
+      {
+        "given": "J. Kenneth",
+        "family": "Baillie"
+      },
+      {
+        "given": "Yuri",
+        "family": "Ishizu"
+      },
+      {
+        "given": "Yuri",
+        "family": "Shimizu"
+      },
+      {
+        "given": "Erina",
+        "family": "Furuhata"
+      },
+      {
+        "given": "Shiori",
+        "family": "Maeda"
+      },
+      {
+        "given": "Yutaka",
+        "family": "Negishi"
+      },
+      {
+        "given": "Christopher J.",
+        "family": "Mungall"
+      },
+      {
+        "given": "Terrence F.",
+        "family": "Meehan"
+      },
+      {
+        "given": "Timo",
+        "family": "Lassmann"
+      },
+      {
+        "given": "Masayoshi",
+        "family": "Itoh"
+      },
+      {
+        "given": "Hideya",
+        "family": "Kawaji"
+      },
+      {
+        "given": "Naoto",
+        "family": "Kondo"
+      },
+      {
+        "given": "Jun",
+        "family": "Kawai"
+      },
+      {
+        "given": "Andreas",
+        "family": "Lennartsson"
+      },
+      {
+        "given": "Carsten O.",
+        "family": "Daub"
+      },
+      {
+        "given": "Peter",
+        "family": "Heutink"
+      },
+      {
+        "given": "David A.",
+        "family": "Hume"
+      },
+      {
+        "given": "Torben Heick",
+        "family": "Jensen"
+      },
+      {
+        "given": "Harukazu",
+        "family": "Suzuki"
+      },
+      {
+        "given": "Yoshihide",
+        "family": "Hayashizaki"
+      },
+      {
+        "given": "Ferenc",
+        "family": "Müller"
+      },
+      {
+        "given": "Alistair R. R.",
+        "family": "Forrest"
+      },
+      {
+        "given": "Piero",
+        "family": "Carninci"
+      },
+      {
+        "given": "Michael",
+        "family": "Rehli"
+      },
+      {
+        "given": "Albin",
+        "family": "Sandelin"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/r35",
+    "container-title-short": "Nature",
+    "PMCID": "PMC5215096",
+    "PMID": "24670763",
+    "id": "SxuuTQTQ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nature12787"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7539",
+    "DOI": "10.1038/nature14248",
+    "type": "article-journal",
+    "page": "317-330",
+    "source": "Crossref",
+    "title": "Integrative analysis of 111 reference human epigenomes",
+    "volume": "518",
+    "author": [
+      {
+        "given": "Anshul",
+        "family": "Kundaje"
+      },
+      {},
+      {
+        "given": "Wouter",
+        "family": "Meuleman"
+      },
+      {
+        "given": "Jason",
+        "family": "Ernst"
+      },
+      {
+        "given": "Misha",
+        "family": "Bilenky"
+      },
+      {
+        "given": "Angela",
+        "family": "Yen"
+      },
+      {
+        "given": "Alireza",
+        "family": "Heravi-Moussavi"
+      },
+      {
+        "given": "Pouya",
+        "family": "Kheradpour"
+      },
+      {
+        "given": "Zhizhuo",
+        "family": "Zhang"
+      },
+      {
+        "given": "Jianrong",
+        "family": "Wang"
+      },
+      {
+        "given": "Michael J.",
+        "family": "Ziller"
+      },
+      {
+        "given": "Viren",
+        "family": "Amin"
+      },
+      {
+        "given": "John W.",
+        "family": "Whitaker"
+      },
+      {
+        "given": "Matthew D.",
+        "family": "Schultz"
+      },
+      {
+        "given": "Lucas D.",
+        "family": "Ward"
+      },
+      {
+        "given": "Abhishek",
+        "family": "Sarkar"
+      },
+      {
+        "given": "Gerald",
+        "family": "Quon"
+      },
+      {
+        "given": "Richard S.",
+        "family": "Sandstrom"
+      },
+      {
+        "given": "Matthew L.",
+        "family": "Eaton"
+      },
+      {
+        "given": "Yi-Chieh",
+        "family": "Wu"
+      },
+      {
+        "given": "Andreas R.",
+        "family": "Pfenning"
+      },
+      {
+        "given": "Xinchen",
+        "family": "Wang"
+      },
+      {
+        "given": "Melina",
+        "family": "Claussnitzer"
+      },
+      {
+        "given": "Yaping",
+        "family": "Liu"
+      },
+      {
+        "given": "Cristian",
+        "family": "Coarfa"
+      },
+      {
+        "given": "R. Alan",
+        "family": "Harris"
+      },
+      {
+        "given": "Noam",
+        "family": "Shoresh"
+      },
+      {
+        "given": "Charles B.",
+        "family": "Epstein"
+      },
+      {
+        "given": "Elizabeta",
+        "family": "Gjoneska"
+      },
+      {
+        "given": "Danny",
+        "family": "Leung"
+      },
+      {
+        "given": "Wei",
+        "family": "Xie"
+      },
+      {
+        "given": "R. David",
+        "family": "Hawkins"
+      },
+      {
+        "given": "Ryan",
+        "family": "Lister"
+      },
+      {
+        "given": "Chibo",
+        "family": "Hong"
+      },
+      {
+        "given": "Philippe",
+        "family": "Gascard"
+      },
+      {
+        "given": "Andrew J.",
+        "family": "Mungall"
+      },
+      {
+        "given": "Richard",
+        "family": "Moore"
+      },
+      {
+        "given": "Eric",
+        "family": "Chuah"
+      },
+      {
+        "given": "Angela",
+        "family": "Tam"
+      },
+      {
+        "given": "Theresa K.",
+        "family": "Canfield"
+      },
+      {
+        "given": "R. Scott",
+        "family": "Hansen"
+      },
+      {
+        "given": "Rajinder",
+        "family": "Kaul"
+      },
+      {
+        "given": "Peter J.",
+        "family": "Sabo"
+      },
+      {
+        "given": "Mukul S.",
+        "family": "Bansal"
+      },
+      {
+        "given": "Annaick",
+        "family": "Carles"
+      },
+      {
+        "given": "Jesse R.",
+        "family": "Dixon"
+      },
+      {
+        "given": "Kai-How",
+        "family": "Farh"
+      },
+      {
+        "given": "Soheil",
+        "family": "Feizi"
+      },
+      {
+        "given": "Rosa",
+        "family": "Karlic"
+      },
+      {
+        "given": "Ah-Ram",
+        "family": "Kim"
+      },
+      {
+        "given": "Ashwinikumar",
+        "family": "Kulkarni"
+      },
+      {
+        "given": "Daofeng",
+        "family": "Li"
+      },
+      {
+        "given": "Rebecca",
+        "family": "Lowdon"
+      },
+      {
+        "given": "GiNell",
+        "family": "Elliott"
+      },
+      {
+        "given": "Tim R.",
+        "family": "Mercer"
+      },
+      {
+        "given": "Shane J.",
+        "family": "Neph"
+      },
+      {
+        "given": "Vitor",
+        "family": "Onuchic"
+      },
+      {
+        "given": "Paz",
+        "family": "Polak"
+      },
+      {
+        "given": "Nisha",
+        "family": "Rajagopal"
+      },
+      {
+        "given": "Pradipta",
+        "family": "Ray"
+      },
+      {
+        "given": "Richard C.",
+        "family": "Sallari"
+      },
+      {
+        "given": "Kyle T.",
+        "family": "Siebenthall"
+      },
+      {
+        "given": "Nicholas A.",
+        "family": "Sinnott-Armstrong"
+      },
+      {
+        "given": "Michael",
+        "family": "Stevens"
+      },
+      {
+        "given": "Robert E.",
+        "family": "Thurman"
+      },
+      {
+        "given": "Jie",
+        "family": "Wu"
+      },
+      {
+        "given": "Bo",
+        "family": "Zhang"
+      },
+      {
+        "given": "Xin",
+        "family": "Zhou"
+      },
+      {
+        "given": "Arthur E.",
+        "family": "Beaudet"
+      },
+      {
+        "given": "Laurie A.",
+        "family": "Boyer"
+      },
+      {
+        "given": "Philip L.",
+        "family": "De Jager"
+      },
+      {
+        "given": "Peggy J.",
+        "family": "Farnham"
+      },
+      {
+        "given": "Susan J.",
+        "family": "Fisher"
+      },
+      {
+        "given": "David",
+        "family": "Haussler"
+      },
+      {
+        "given": "Steven J. M.",
+        "family": "Jones"
+      },
+      {
+        "given": "Wei",
+        "family": "Li"
+      },
+      {
+        "given": "Marco A.",
+        "family": "Marra"
+      },
+      {
+        "given": "Michael T.",
+        "family": "McManus"
+      },
+      {
+        "given": "Shamil",
+        "family": "Sunyaev"
+      },
+      {
+        "given": "James A.",
+        "family": "Thomson"
+      },
+      {
+        "given": "Thea D.",
+        "family": "Tlsty"
+      },
+      {
+        "given": "Li-Huei",
+        "family": "Tsai"
+      },
+      {
+        "given": "Wei",
+        "family": "Wang"
+      },
+      {
+        "given": "Robert A.",
+        "family": "Waterland"
+      },
+      {
+        "given": "Michael Q.",
+        "family": "Zhang"
+      },
+      {
+        "given": "Lisa H.",
+        "family": "Chadwick"
+      },
+      {
+        "given": "Bradley E.",
+        "family": "Bernstein"
+      },
+      {
+        "given": "Joseph F.",
+        "family": "Costello"
+      },
+      {
+        "given": "Joseph R.",
+        "family": "Ecker"
+      },
+      {
+        "given": "Martin",
+        "family": "Hirst"
+      },
+      {
+        "given": "Alexander",
+        "family": "Meissner"
+      },
+      {
+        "given": "Aleksandar",
+        "family": "Milosavljevic"
+      },
+      {
+        "given": "Bing",
+        "family": "Ren"
+      },
+      {
+        "given": "John A.",
+        "family": "Stamatoyannopoulos"
+      },
+      {
+        "given": "Ting",
+        "family": "Wang"
+      },
+      {
+        "given": "Manolis",
+        "family": "Kellis"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          2,
+          18
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f62jpn",
+    "container-title-short": "Nature",
+    "PMCID": "PMC4530010",
+    "PMID": "25693563",
+    "id": "sLkFMFZj",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nature14248"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/nbt.3838",
+    "type": "article-journal",
+    "page": "319-321",
+    "source": "Crossref",
+    "title": "Reproducible RNA-seq analysis using recount2",
+    "volume": "35",
+    "author": [
+      {
+        "given": "Leonardo",
+        "family": "Collado-Torres"
+      },
+      {
+        "given": "Abhinav",
+        "family": "Nellore"
+      },
+      {
+        "given": "Kai",
+        "family": "Kammers"
+      },
+      {
+        "given": "Shannon E",
+        "family": "Ellis"
+      },
+      {
+        "given": "Margaret A",
+        "family": "Taub"
+      },
+      {
+        "given": "Kasper D",
+        "family": "Hansen"
+      },
+      {
+        "given": "Andrew E",
+        "family": "Jaffe"
+      },
+      {
+        "given": "Ben",
+        "family": "Langmead"
+      },
+      {
+        "given": "Jeffrey T",
+        "family": "Leek"
+      }
+    ],
+    "container-title": "Nature Biotechnology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf75hp",
+    "container-title-short": "Nat Biotechnol",
+    "PMCID": "PMC6742427",
+    "PMID": "28398307",
+    "id": "6SPTvFXq",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nbt.3838"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/ncomms6890",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Biological interpretation of genome-wide association studies using predicted gene functions",
+    "volume": "6",
+    "author": [
+      {
+        "given": "Tune H.",
+        "family": "Pers"
+      },
+      {},
+      {
+        "given": "Juha M.",
+        "family": "Karjalainen"
+      },
+      {
+        "given": "Yingleong",
+        "family": "Chan"
+      },
+      {
+        "given": "Harm-Jan",
+        "family": "Westra"
+      },
+      {
+        "given": "Andrew R.",
+        "family": "Wood"
+      },
+      {
+        "given": "Jian",
+        "family": "Yang"
+      },
+      {
+        "given": "Julian C.",
+        "family": "Lui"
+      },
+      {
+        "given": "Sailaja",
+        "family": "Vedantam"
+      },
+      {
+        "given": "Stefan",
+        "family": "Gustafsson"
+      },
+      {
+        "given": "Tonu",
+        "family": "Esko"
+      },
+      {
+        "given": "Tim",
+        "family": "Frayling"
+      },
+      {
+        "given": "Elizabeth K.",
+        "family": "Speliotes"
+      },
+      {
+        "given": "Michael",
+        "family": "Boehnke"
+      },
+      {
+        "given": "Soumya",
+        "family": "Raychaudhuri"
+      },
+      {
+        "given": "Rudolf S. N.",
+        "family": "Fehrmann"
+      },
+      {
+        "given": "Joel N.",
+        "family": "Hirschhorn"
+      },
+      {
+        "given": "Lude",
+        "family": "Franke"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          1,
+          19
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3mwhd",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC4420238",
+    "PMID": "25597830",
+    "id": "z8MQTAnJ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ncomms6890"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "6",
+    "DOI": "10.1038/ng.3259",
+    "type": "article-journal",
+    "page": "569-576",
+    "source": "Crossref",
+    "title": "Understanding multicellular function and disease with human tissue-specific networks",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Casey S",
+        "family": "Greene"
+      },
+      {
+        "given": "Arjun",
+        "family": "Krishnan"
+      },
+      {
+        "given": "Aaron K",
+        "family": "Wong"
+      },
+      {
+        "given": "Emanuela",
+        "family": "Ricciotti"
+      },
+      {
+        "given": "Rene A",
+        "family": "Zelaya"
+      },
+      {
+        "given": "Daniel S",
+        "family": "Himmelstein"
+      },
+      {
+        "given": "Ran",
+        "family": "Zhang"
+      },
+      {
+        "given": "Boris M",
+        "family": "Hartmann"
+      },
+      {
+        "given": "Elena",
+        "family": "Zaslavsky"
+      },
+      {
+        "given": "Stuart C",
+        "family": "Sealfon"
+      },
+      {
+        "given": "Daniel I",
+        "family": "Chasman"
+      },
+      {
+        "given": "Garret A",
+        "family": "FitzGerald"
+      },
+      {
+        "given": "Kara",
+        "family": "Dolinski"
+      },
+      {
+        "given": "Tilo",
+        "family": "Grosser"
+      },
+      {
+        "given": "Olga G",
+        "family": "Troyanskaya"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          4,
+          27
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7dvkv",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC4828725",
+    "PMID": "25915600",
+    "id": "CVF61Un5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3259"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "8",
+    "DOI": "10.1038/ng.3314",
+    "type": "article-journal",
+    "page": "856-860",
+    "source": "Crossref",
+    "title": "The support of human genetic evidence for approved drug indications",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Matthew R",
+        "family": "Nelson"
+      },
+      {
+        "given": "Hannah",
+        "family": "Tipney"
+      },
+      {
+        "given": "Jeffery L",
+        "family": "Painter"
+      },
+      {
+        "given": "Judong",
+        "family": "Shen"
+      },
+      {
+        "given": "Paola",
+        "family": "Nicoletti"
+      },
+      {
+        "given": "Yufeng",
+        "family": "Shen"
+      },
+      {
+        "given": "Aris",
+        "family": "Floratos"
+      },
+      {
+        "given": "Pak Chung",
+        "family": "Sham"
+      },
+      {
+        "given": "Mulin Jun",
+        "family": "Li"
+      },
+      {
+        "given": "Junwen",
+        "family": "Wang"
+      },
+      {
+        "given": "Lon R",
+        "family": "Cardon"
+      },
+      {
+        "given": "John C",
+        "family": "Whittaker"
+      },
+      {
+        "given": "Philippe",
+        "family": "Sanseau"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          6,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3mn52",
+    "container-title-short": "Nat Genet",
+    "PMID": "26121088",
+    "id": "REXpV7nA",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3314"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "9",
+    "DOI": "10.1038/ng.3367",
+    "type": "article-journal",
+    "page": "1091-1098",
+    "source": "Crossref",
+    "title": "A gene-based association method for mapping traits using reference transcriptome data",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Eric R",
+        "family": "Gamazon"
+      },
+      {
+        "given": "Heather E",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Kaanan P",
+        "family": "Shah"
+      },
+      {
+        "given": "Sahar V",
+        "family": "Mozaffari"
+      },
+      {
+        "given": "Keston",
+        "family": "Aquino-Michaels"
+      },
+      {
+        "given": "Robert J",
+        "family": "Carroll"
+      },
+      {
+        "given": "Anne E",
+        "family": "Eyler"
+      },
+      {
+        "given": "Joshua C",
+        "family": "Denny"
+      },
+      {
+        "given": "Dan L",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Nancy J",
+        "family": "Cox"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          8,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7p9zv",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC4552594",
+    "PMID": "26258848",
+    "id": "Z8bvDdVq",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3367"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "3",
+    "DOI": "10.1038/ng.3506",
+    "type": "article-journal",
+    "page": "245-252",
+    "source": "Crossref",
+    "title": "Integrative approaches for large-scale transcriptome-wide association studies",
+    "volume": "48",
+    "author": [
+      {
+        "given": "Alexander",
+        "family": "Gusev"
+      },
+      {
+        "given": "Arthur",
+        "family": "Ko"
+      },
+      {
+        "given": "Huwenbo",
+        "family": "Shi"
+      },
+      {
+        "given": "Gaurav",
+        "family": "Bhatia"
+      },
+      {
+        "given": "Wonil",
+        "family": "Chung"
+      },
+      {
+        "given": "Brenda W J H",
+        "family": "Penninx"
+      },
+      {
+        "given": "Rick",
+        "family": "Jansen"
+      },
+      {
+        "given": "Eco J C",
+        "family": "de Geus"
+      },
+      {
+        "given": "Dorret I",
+        "family": "Boomsma"
+      },
+      {
+        "given": "Fred A",
+        "family": "Wright"
+      },
+      {
+        "given": "Patrick F",
+        "family": "Sullivan"
+      },
+      {
+        "given": "Elina",
+        "family": "Nikkola"
+      },
+      {
+        "given": "Marcus",
+        "family": "Alvarez"
+      },
+      {
+        "given": "Mete",
+        "family": "Civelek"
+      },
+      {
+        "given": "Aldons J",
+        "family": "Lusis"
+      },
+      {
+        "given": "Terho",
+        "family": "Lehtimäki"
+      },
+      {
+        "given": "Emma",
+        "family": "Raitoharju"
+      },
+      {
+        "given": "Mika",
+        "family": "Kähönen"
+      },
+      {
+        "given": "Ilkka",
+        "family": "Seppälä"
+      },
+      {
+        "given": "Olli T",
+        "family": "Raitakari"
+      },
+      {
+        "given": "Johanna",
+        "family": "Kuusisto"
+      },
+      {
+        "given": "Markku",
+        "family": "Laakso"
+      },
+      {
+        "given": "Alkes L",
+        "family": "Price"
+      },
+      {
+        "given": "Päivi",
+        "family": "Pajukanta"
+      },
+      {
+        "given": "Bogdan",
+        "family": "Pasaniuc"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          2,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3vf4p",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC4767558",
+    "PMID": "26854917",
+    "id": "1D63fEEPb",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3506"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7",
+    "DOI": "10.1038/ng.3570",
+    "type": "article-journal",
+    "page": "709-717",
+    "source": "Crossref",
+    "title": "Detection and interpretation of shared genetic influences on 42 human traits",
+    "volume": "48",
+    "author": [
+      {
+        "given": "Joseph K",
+        "family": "Pickrell"
+      },
+      {
+        "given": "Tomaz",
+        "family": "Berisa"
+      },
+      {
+        "given": "Jimmy Z",
+        "family": "Liu"
+      },
+      {
+        "given": "Laure",
+        "family": "Ségurel"
+      },
+      {
+        "given": "Joyce Y",
+        "family": "Tung"
+      },
+      {
+        "given": "David A",
+        "family": "Hinds"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          5,
+          16
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f8ssw4",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5207801",
+    "PMID": "27182965",
+    "id": "PDWEwciL",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3570"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "12",
+    "DOI": "10.1038/ng.3985",
+    "type": "article-journal",
+    "page": "1752-1757",
+    "source": "Crossref",
+    "title": "Shared genetic origin of asthma, hay fever and eczema elucidates allergic disease biology",
+    "volume": "49",
+    "author": [
+      {
+        "given": "Manuel A",
+        "family": "Ferreira"
+      },
+      {},
+      {
+        "given": "Judith M",
+        "family": "Vonk"
+      },
+      {
+        "given": "Hansjörg",
+        "family": "Baurecht"
+      },
+      {
+        "given": "Ingo",
+        "family": "Marenholz"
+      },
+      {
+        "given": "Chao",
+        "family": "Tian"
+      },
+      {
+        "given": "Joshua D",
+        "family": "Hoffman"
+      },
+      {
+        "given": "Quinta",
+        "family": "Helmer"
+      },
+      {
+        "given": "Annika",
+        "family": "Tillander"
+      },
+      {
+        "given": "Vilhelmina",
+        "family": "Ullemar"
+      },
+      {
+        "given": "Jenny",
+        "family": "van Dongen"
+      },
+      {
+        "given": "Yi",
+        "family": "Lu"
+      },
+      {
+        "given": "Franz",
+        "family": "Rüschendorf"
+      },
+      {
+        "given": "Jorge",
+        "family": "Esparza-Gordillo"
+      },
+      {
+        "given": "Chris W",
+        "family": "Medway"
+      },
+      {
+        "given": "Edward",
+        "family": "Mountjoy"
+      },
+      {
+        "given": "Kimberley",
+        "family": "Burrows"
+      },
+      {
+        "given": "Oliver",
+        "family": "Hummel"
+      },
+      {
+        "given": "Sarah",
+        "family": "Grosche"
+      },
+      {
+        "given": "Ben M",
+        "family": "Brumpton"
+      },
+      {
+        "given": "John S",
+        "family": "Witte"
+      },
+      {
+        "given": "Jouke-Jan",
+        "family": "Hottenga"
+      },
+      {
+        "given": "Gonneke",
+        "family": "Willemsen"
+      },
+      {
+        "given": "Jie",
+        "family": "Zheng"
+      },
+      {
+        "given": "Elke",
+        "family": "Rodríguez"
+      },
+      {
+        "given": "Melanie",
+        "family": "Hotze"
+      },
+      {
+        "given": "Andre",
+        "family": "Franke"
+      },
+      {
+        "given": "Joana A",
+        "family": "Revez"
+      },
+      {
+        "given": "Jonathan",
+        "family": "Beesley"
+      },
+      {
+        "given": "Melanie C",
+        "family": "Matheson"
+      },
+      {
+        "given": "Shyamali C",
+        "family": "Dharmage"
+      },
+      {
+        "given": "Lisa M",
+        "family": "Bain"
+      },
+      {
+        "given": "Lars G",
+        "family": "Fritsche"
+      },
+      {
+        "given": "Maiken E",
+        "family": "Gabrielsen"
+      },
+      {
+        "given": "Brunilda",
+        "family": "Balliu"
+      },
+      {
+        "given": "Jonas B",
+        "family": "Nielsen"
+      },
+      {
+        "given": "Wei",
+        "family": "Zhou"
+      },
+      {
+        "given": "Kristian",
+        "family": "Hveem"
+      },
+      {
+        "given": "Arnulf",
+        "family": "Langhammer"
+      },
+      {
+        "given": "Oddgeir L",
+        "family": "Holmen"
+      },
+      {
+        "given": "Mari",
+        "family": "Løset"
+      },
+      {
+        "given": "Gonçalo R",
+        "family": "Abecasis"
+      },
+      {
+        "given": "Cristen J",
+        "family": "Willer"
+      },
+      {
+        "given": "Andreas",
+        "family": "Arnold"
+      },
+      {
+        "given": "Georg",
+        "family": "Homuth"
+      },
+      {
+        "given": "Carsten O",
+        "family": "Schmidt"
+      },
+      {
+        "given": "Philip J",
+        "family": "Thompson"
+      },
+      {
+        "given": "Nicholas G",
+        "family": "Martin"
+      },
+      {
+        "given": "David L",
+        "family": "Duffy"
+      },
+      {
+        "given": "Natalija",
+        "family": "Novak"
+      },
+      {
+        "given": "Holger",
+        "family": "Schulz"
+      },
+      {
+        "given": "Stefan",
+        "family": "Karrasch"
+      },
+      {
+        "given": "Christian",
+        "family": "Gieger"
+      },
+      {
+        "given": "Konstantin",
+        "family": "Strauch"
+      },
+      {
+        "given": "Ronald B",
+        "family": "Melles"
+      },
+      {
+        "given": "David A",
+        "family": "Hinds"
+      },
+      {
+        "given": "Norbert",
+        "family": "Hübner"
+      },
+      {
+        "given": "Stephan",
+        "family": "Weidinger"
+      },
+      {
+        "given": "Patrik K E",
+        "family": "Magnusson"
+      },
+      {
+        "given": "Rick",
+        "family": "Jansen"
+      },
+      {
+        "given": "Eric",
+        "family": "Jorgenson"
+      },
+      {
+        "given": "Young-Ae",
+        "family": "Lee"
+      },
+      {
+        "given": "Dorret I",
+        "family": "Boomsma"
+      },
+      {
+        "given": "Catarina",
+        "family": "Almqvist"
+      },
+      {
+        "given": "Robert",
+        "family": "Karlsson"
+      },
+      {
+        "given": "Gerard H",
+        "family": "Koppelman"
+      },
+      {
+        "given": "Lavinia",
+        "family": "Paternoster"
+      },
+      {},
+      {},
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          10,
+          30
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gchg62",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5989923",
+    "PMID": "29083406",
+    "id": "LyJmyoQr",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3985"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "10",
+    "DOI": "10.1038/nn.4618",
+    "type": "article-journal",
+    "page": "1342-1349",
+    "source": "Crossref",
+    "title": "Analysis of genome-wide association data highlights candidates for drug repositioning in psychiatry",
+    "volume": "20",
+    "author": [
+      {
+        "given": "Hon-Cheong",
+        "family": "So"
+      },
+      {
+        "given": "Carlos Kwan-Long",
+        "family": "Chau"
+      },
+      {
+        "given": "Wan-To",
+        "family": "Chiu"
+      },
+      {
+        "given": "Kin-Sang",
+        "family": "Ho"
+      },
+      {
+        "given": "Cho-Pong",
+        "family": "Lo"
+      },
+      {
+        "given": "Stephanie Ho-Yue",
+        "family": "Yim"
+      },
+      {
+        "given": "Pak-Chung",
+        "family": "Sham"
+      }
+    ],
+    "container-title": "Nature Neuroscience",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          8,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gbrssh",
+    "container-title-short": "Nat Neurosci",
+    "PMID": "28805813",
+    "id": "17oeJ0CXy",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nn.4618"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-03424-4",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "A comprehensive evaluation of module detection methods for gene expression data",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Wouter",
+        "family": "Saelens"
+      },
+      {
+        "given": "Robrecht",
+        "family": "Cannoodt"
+      },
+      {
+        "given": "Yvan",
+        "family": "Saeys"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          3,
+          15
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gc9x36",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC5854612",
+    "PMID": "29545622",
+    "id": "1BVbSrr6M",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-03424-4"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-03621-1",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Exploring the phenotypic consequences of tissue specific gene expression variation inferred from GWAS summary statistics",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {},
+      {
+        "given": "Scott P.",
+        "family": "Dickinson"
+      },
+      {
+        "given": "Rodrigo",
+        "family": "Bonazzola"
+      },
+      {
+        "given": "Jiamao",
+        "family": "Zheng"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Jason M.",
+        "family": "Torres"
+      },
+      {
+        "given": "Eric S.",
+        "family": "Torstenson"
+      },
+      {
+        "given": "Kaanan P.",
+        "family": "Shah"
+      },
+      {
+        "given": "Tzintzuni",
+        "family": "Garcia"
+      },
+      {
+        "given": "Todd L.",
+        "family": "Edwards"
+      },
+      {
+        "given": "Eli A.",
+        "family": "Stahl"
+      },
+      {
+        "given": "Laura M.",
+        "family": "Huckins"
+      },
+      {
+        "given": "Dan L.",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Nancy J.",
+        "family": "Cox"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          5,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdjvp5",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC5940825",
+    "PMID": "29739930",
+    "id": "vLyTudUB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-03621-1"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-03751-6",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Massive mining of publicly available RNA-seq data from human and mouse",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Alexander",
+        "family": "Lachmann"
+      },
+      {
+        "given": "Denis",
+        "family": "Torre"
+      },
+      {
+        "given": "Alexandra B.",
+        "family": "Keenan"
+      },
+      {
+        "given": "Kathleen M.",
+        "family": "Jagodnik"
+      },
+      {
+        "given": "Hoyjin J.",
+        "family": "Lee"
+      },
+      {
+        "given": "Lily",
+        "family": "Wang"
+      },
+      {
+        "given": "Moshe C.",
+        "family": "Silverstein"
+      },
+      {
+        "given": "Avi",
+        "family": "Ma’ayan"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          4,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gc92dr",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC5893633",
+    "PMID": "29636450",
+    "id": "Nz3IMEzd",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-03751-6"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-06022-6",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Conditional and interaction gene-set analysis reveals novel functional pathways for blood pressure",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Christiaan A.",
+        "family": "de Leeuw"
+      },
+      {
+        "given": "Sven",
+        "family": "Stringer"
+      },
+      {
+        "given": "Ilona A.",
+        "family": "Dekkers"
+      },
+      {
+        "given": "Tom",
+        "family": "Heskes"
+      },
+      {
+        "given": "Danielle",
+        "family": "Posthuma"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          9,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gd6d85",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC6138636",
+    "PMID": "30218068",
+    "id": "Om8ZhS06",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-06022-6"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Millions of transcriptomic profiles have been deposited in public archives, yet remain underused for the interpretation of new experiments. We present a method for interpreting new transcriptomic datasets through instant comparison to public datasets without high-performance computing requirements. We apply Principal Component Analysis on 536 studies comprising 44,890 human RNA sequencing profiles and aggregate sufficiently similar loading vectors to form Replicable Axes of Variation (RAV). RAVs are annotated with metadata of originating studies and by gene set enrichment analysis. Functionality to associate new datasets with RAVs, extract interpretable annotations, and provide intuitive visualization are implemented as the GenomicSuperSignature R/Bioconductor package. We demonstrate the efficient and coherent database search, robustness to batch effects and heterogeneous training data, and transfer learning capacity of our method using TCGA and rare diseases datasets. GenomicSuperSignature aids in analyzing new gene expression data in the context of existing databases using minimal computing resources.</jats:p>",
+    "DOI": "10.1038/s41467-022-31411-3",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "GenomicSuperSignature facilitates interpretation of RNA-seq experiments through robust, efficient comparison to public databases",
+    "volume": "13",
+    "author": [
+      {
+        "given": "Sehyun",
+        "family": "Oh"
+      },
+      {
+        "given": "Ludwig",
+        "family": "Geistlinger"
+      },
+      {
+        "given": "Marcel",
+        "family": "Ramos"
+      },
+      {
+        "given": "Daniel",
+        "family": "Blankenberg"
+      },
+      {
+        "given": "Marius",
+        "family": "van den Beek"
+      },
+      {
+        "given": "Jaclyn N.",
+        "family": "Taroni"
+      },
+      {
+        "given": "Vincent J.",
+        "family": "Carey"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      },
+      {
+        "given": "Levi",
+        "family": "Waldron"
+      },
+      {
+        "given": "Sean",
+        "family": "Davis"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2022,
+          6,
+          27
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqd7hm",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC9237024",
+    "PMID": "35760813",
+    "id": "X4fhSCkz",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-022-31411-3"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "3",
+    "DOI": "10.1038/s41576-019-0200-9",
+    "type": "article-journal",
+    "page": "137-150",
+    "source": "Crossref",
+    "title": "Mechanisms of tissue and cell-type specificity in heritable traits and diseases",
+    "volume": "21",
+    "author": [
+      {
+        "given": "Idan",
+        "family": "Hekselman"
+      },
+      {
+        "given": "Esti",
+        "family": "Yeger-Lotem"
+      }
+    ],
+    "container-title": "Nature Reviews Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          1,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggkx9v",
+    "container-title-short": "Nat Rev Genet",
+    "PMID": "31913361",
+    "id": "nhaocxmR",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41576-019-0200-9"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7726",
+    "DOI": "10.1038/s41586-018-0579-z",
+    "type": "article-journal",
+    "page": "203-209",
+    "source": "Crossref",
+    "title": "The UK Biobank resource with deep phenotyping and genomic data",
+    "volume": "562",
+    "author": [
+      {
+        "given": "Clare",
+        "family": "Bycroft"
+      },
+      {
+        "given": "Colin",
+        "family": "Freeman"
+      },
+      {
+        "given": "Desislava",
+        "family": "Petkova"
+      },
+      {
+        "given": "Gavin",
+        "family": "Band"
+      },
+      {
+        "given": "Lloyd T.",
+        "family": "Elliott"
+      },
+      {
+        "given": "Kevin",
+        "family": "Sharp"
+      },
+      {
+        "given": "Allan",
+        "family": "Motyer"
+      },
+      {
+        "given": "Damjan",
+        "family": "Vukcevic"
+      },
+      {
+        "given": "Olivier",
+        "family": "Delaneau"
+      },
+      {
+        "given": "Jared",
+        "family": "O’Connell"
+      },
+      {
+        "given": "Adrian",
+        "family": "Cortes"
+      },
+      {
+        "given": "Samantha",
+        "family": "Welsh"
+      },
+      {
+        "given": "Alan",
+        "family": "Young"
+      },
+      {
+        "given": "Mark",
+        "family": "Effingham"
+      },
+      {
+        "given": "Gil",
+        "family": "McVean"
+      },
+      {
+        "given": "Stephen",
+        "family": "Leslie"
+      },
+      {
+        "given": "Naomi",
+        "family": "Allen"
+      },
+      {
+        "given": "Peter",
+        "family": "Donnelly"
+      },
+      {
+        "given": "Jonathan",
+        "family": "Marchini"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfb7h2",
+    "container-title-short": "Nature",
+    "PMCID": "PMC6786975",
+    "PMID": "30305743",
+    "id": "nmJxPpE5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41586-018-0579-z"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7845",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Annotating the molecular basis of human disease remains an unsolved challenge, as 93% of disease loci are non-coding and gene-regulatory annotations are highly incomplete<jats:sup>1–3</jats:sup>. Here we present EpiMap, a compendium comprising 10,000 epigenomic maps across 800 samples, which we used to define chromatin states, high-resolution enhancers, enhancer modules, upstream regulators and downstream target genes. We used this resource to annotate 30,000 genetic loci that were associated with 540 traits<jats:sup>4</jats:sup>, predicting trait-relevant tissues, putative causal nucleotide variants in enriched tissue enhancers and candidate tissue-specific target genes for each. We partitioned multifactorial traits into tissue-specific contributing factors with distinct functional enrichments and disease comorbidity patterns, and revealed both single-factor monotropic and multifactor pleiotropic loci. Top-scoring loci frequently had multiple predicted driver variants, converging through multiple enhancers with a common target gene, multiple genes in common tissues, or multiple genes and multiple tissues, indicating extensive pleiotropy. Our results demonstrate the importance of dense, rich, high-resolution epigenomic annotations for the investigation of complex traits.</jats:p>",
+    "DOI": "10.1038/s41586-020-03145-z",
+    "type": "article-journal",
+    "page": "300-307",
+    "source": "Crossref",
+    "title": "Regulatory genomic circuitry of human disease loci by integrative epigenomics",
+    "volume": "590",
+    "author": [
+      {
+        "given": "Carles A.",
+        "family": "Boix"
+      },
+      {
+        "given": "Benjamin T.",
+        "family": "James"
+      },
+      {
+        "given": "Yongjin P.",
+        "family": "Park"
+      },
+      {
+        "given": "Wouter",
+        "family": "Meuleman"
+      },
+      {
+        "given": "Manolis",
+        "family": "Kellis"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          2,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghzkhr",
+    "container-title-short": "Nature",
+    "PMCID": "PMC7875769",
+    "PMID": "33536621",
+    "id": "xRGqPsT2",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41586-020-03145-z"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7820",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>DNase I hypersensitive sites (DHSs) are generic markers of regulatory DNA<jats:sup>1–5</jats:sup>and contain genetic variations associated with diseases and phenotypic traits<jats:sup>6–8</jats:sup>. We created high-resolution maps of DHSs from 733 human biosamples encompassing 438 cell and tissue types and states, and integrated these to delineate and numerically index approximately 3.6 million DHSs within the human genome sequence, providing a common coordinate system for regulatory DNA. Here we show that these maps highly resolve the<jats:italic>cis</jats:italic>-regulatory compartment of the human genome, which encodes unexpectedly diverse cell- and tissue-selective regulatory programs at very high density. These programs can be captured comprehensively by a simple vocabulary that enables the assignment to each DHS of a regulatory barcode that encapsulates its tissue manifestations, and global annotation of protein-coding and non-coding RNA genes in a manner orthogonal to gene expression. Finally, we show that sharply resolved DHSs markedly enhance the genetic association and heritability signals of diseases and traits. Rather than being confined to a small number of distal elements or promoters, we find that genetic signals converge on congruently regulated sets of DHSs that decorate entire gene bodies. Together, our results create a universal, extensible coordinate system and vocabulary for human regulatory DNA marked by DHSs, and provide a new global perspective on the architecture of human gene regulation.</jats:p>",
+    "DOI": "10.1038/s41586-020-2559-3",
+    "type": "article-journal",
+    "page": "244-251",
+    "source": "Crossref",
+    "title": "Index and biological spectrum of human DNase I hypersensitive sites",
+    "volume": "584",
+    "author": [
+      {
+        "given": "Wouter",
+        "family": "Meuleman"
+      },
+      {
+        "given": "Alexander",
+        "family": "Muratov"
+      },
+      {
+        "given": "Eric",
+        "family": "Rynes"
+      },
+      {
+        "given": "Jessica",
+        "family": "Halow"
+      },
+      {
+        "given": "Kristen",
+        "family": "Lee"
+      },
+      {
+        "given": "Daniel",
+        "family": "Bates"
+      },
+      {
+        "given": "Morgan",
+        "family": "Diegel"
+      },
+      {
+        "given": "Douglas",
+        "family": "Dunn"
+      },
+      {
+        "given": "Fidencio",
+        "family": "Neri"
+      },
+      {
+        "given": "Athanasios",
+        "family": "Teodosiadis"
+      },
+      {
+        "given": "Alex",
+        "family": "Reynolds"
+      },
+      {
+        "given": "Eric",
+        "family": "Haugen"
+      },
+      {
+        "given": "Jemma",
+        "family": "Nelson"
+      },
+      {
+        "given": "Audra",
+        "family": "Johnson"
+      },
+      {
+        "given": "Mark",
+        "family": "Frerker"
+      },
+      {
+        "given": "Michael",
+        "family": "Buckley"
+      },
+      {
+        "given": "Richard",
+        "family": "Sandstrom"
+      },
+      {
+        "given": "Jeff",
+        "family": "Vierstra"
+      },
+      {
+        "given": "Rajinder",
+        "family": "Kaul"
+      },
+      {
+        "given": "John",
+        "family": "Stamatoyannopoulos"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          7,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gg6dhp",
+    "container-title-short": "Nature",
+    "PMCID": "PMC7422677",
+    "PMID": "32728217",
+    "id": "1DoyZS7y0",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41586-020-2559-3"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/s41588-018-0081-4",
+    "type": "article-journal",
+    "page": "621-629",
+    "source": "Crossref",
+    "title": "Heritability enrichment of specifically expressed genes identifies disease-relevant tissues and cell types",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Hilary K.",
+        "family": "Finucane"
+      },
+      {
+        "given": "Yakir A.",
+        "family": "Reshef"
+      },
+      {
+        "given": "Verneri",
+        "family": "Anttila"
+      },
+      {
+        "given": "Kamil",
+        "family": "Slowikowski"
+      },
+      {
+        "given": "Alexander",
+        "family": "Gusev"
+      },
+      {
+        "given": "Andrea",
+        "family": "Byrnes"
+      },
+      {
+        "given": "Steven",
+        "family": "Gazal"
+      },
+      {
+        "given": "Po-Ru",
+        "family": "Loh"
+      },
+      {
+        "given": "Caleb",
+        "family": "Lareau"
+      },
+      {
+        "given": "Noam",
+        "family": "Shoresh"
+      },
+      {
+        "given": "Giulio",
+        "family": "Genovese"
+      },
+      {
+        "given": "Arpiar",
+        "family": "Saunders"
+      },
+      {
+        "given": "Evan",
+        "family": "Macosko"
+      },
+      {
+        "given": "Samuela",
+        "family": "Pollack"
+      },
+      {
+        "given": "John R. B.",
+        "family": "Perry"
+      },
+      {
+        "given": "Jason D.",
+        "family": "Buenrostro"
+      },
+      {
+        "given": "Bradley E.",
+        "family": "Bernstein"
+      },
+      {
+        "given": "Soumya",
+        "family": "Raychaudhuri"
+      },
+      {
+        "given": "Steven",
+        "family": "McCarroll"
+      },
+      {
+        "given": "Benjamin M.",
+        "family": "Neale"
+      },
+      {
+        "given": "Alkes L.",
+        "family": "Price"
+      },
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdfjqt",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5896795",
+    "PMID": "29632380",
+    "id": "WFslDIWl",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-018-0081-4"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/s41588-018-0092-1",
+    "type": "article-journal",
+    "page": "538-548",
+    "source": "Crossref",
+    "title": "Transcriptome-wide association study of schizophrenia and chromatin activity yields mechanistic disease insights",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Alexander",
+        "family": "Gusev"
+      },
+      {
+        "given": "Nicholas",
+        "family": "Mancuso"
+      },
+      {
+        "given": "Hyejung",
+        "family": "Won"
+      },
+      {
+        "given": "Maria",
+        "family": "Kousi"
+      },
+      {
+        "given": "Hilary K.",
+        "family": "Finucane"
+      },
+      {
+        "given": "Yakir",
+        "family": "Reshef"
+      },
+      {
+        "given": "Lingyun",
+        "family": "Song"
+      },
+      {
+        "given": "Alexias",
+        "family": "Safi"
+      },
+      {
+        "given": "Steven",
+        "family": "McCarroll"
+      },
+      {
+        "given": "Benjamin M.",
+        "family": "Neale"
+      },
+      {
+        "given": "Roel A.",
+        "family": "Ophoff"
+      },
+      {
+        "given": "Michael C.",
+        "family": "O’Donovan"
+      },
+      {
+        "given": "Gregory E.",
+        "family": "Crawford"
+      },
+      {
+        "given": "Daniel H.",
+        "family": "Geschwind"
+      },
+      {
+        "given": "Nicholas",
+        "family": "Katsanis"
+      },
+      {
+        "given": "Patrick F.",
+        "family": "Sullivan"
+      },
+      {
+        "given": "Bogdan",
+        "family": "Pasaniuc"
+      },
+      {
+        "given": "Alkes L.",
+        "family": "Price"
+      },
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdfdf2",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5942893",
+    "PMID": "29632383",
+    "id": "AxVJwanp",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-018-0092-1"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "6",
+    "DOI": "10.1038/s41588-018-0121-0",
+    "type": "article-journal",
+    "page": "857-864",
+    "source": "Crossref",
+    "title": "A genome-wide cross-trait analysis from UK Biobank highlights the shared genetic architecture of asthma and allergic diseases",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Zhaozhong",
+        "family": "Zhu"
+      },
+      {
+        "given": "Phil H.",
+        "family": "Lee"
+      },
+      {
+        "given": "Mark D.",
+        "family": "Chaffin"
+      },
+      {
+        "given": "Wonil",
+        "family": "Chung"
+      },
+      {
+        "given": "Po-Ru",
+        "family": "Loh"
+      },
+      {
+        "given": "Quan",
+        "family": "Lu"
+      },
+      {
+        "given": "David C.",
+        "family": "Christiani"
+      },
+      {
+        "given": "Liming",
+        "family": "Liang"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          5,
+          21
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdpmtn",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5980765",
+    "PMID": "29785011",
+    "id": "veADXImD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-018-0121-0"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/s41588-019-0385-z",
+    "type": "article-journal",
+    "page": "592-599",
+    "source": "Crossref",
+    "title": "Opportunities and challenges for transcriptome-wide association studies",
+    "volume": "51",
+    "author": [
+      {
+        "given": "Michael",
+        "family": "Wainberg"
+      },
+      {
+        "given": "Nasa",
+        "family": "Sinnott-Armstrong"
+      },
+      {
+        "given": "Nicholas",
+        "family": "Mancuso"
+      },
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "David A.",
+        "family": "Knowles"
+      },
+      {
+        "given": "David",
+        "family": "Golan"
+      },
+      {
+        "given": "Raili",
+        "family": "Ermel"
+      },
+      {
+        "given": "Arno",
+        "family": "Ruusalepp"
+      },
+      {
+        "given": "Thomas",
+        "family": "Quertermous"
+      },
+      {
+        "given": "Ke",
+        "family": "Hao"
+      },
+      {
+        "given": "Johan L. M.",
+        "family": "Björkegren"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Bogdan",
+        "family": "Pasaniuc"
+      },
+      {
+        "given": "Manuel A.",
+        "family": "Rivas"
+      },
+      {
+        "given": "Anshul",
+        "family": "Kundaje"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          3,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf3hmr",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC6777347",
+    "PMID": "30926968",
+    "id": "l6ogswV3",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-019-0385-z"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "9",
+    "DOI": "10.1038/s41588-019-0481-0",
+    "type": "article-journal",
+    "page": "1339-1348",
+    "source": "Crossref",
+    "title": "A global overview of pleiotropy and genetic architecture in complex traits",
+    "volume": "51",
+    "author": [
+      {
+        "given": "Kyoko",
+        "family": "Watanabe"
+      },
+      {
+        "given": "Sven",
+        "family": "Stringer"
+      },
+      {
+        "given": "Oleksandr",
+        "family": "Frei"
+      },
+      {
+        "given": "Maša",
+        "family": "Umićević Mirkov"
+      },
+      {
+        "given": "Christiaan",
+        "family": "de Leeuw"
+      },
+      {
+        "given": "Tinca J. C.",
+        "family": "Polderman"
+      },
+      {
+        "given": "Sophie",
+        "family": "van der Sluis"
+      },
+      {
+        "given": "Ole A.",
+        "family": "Andreassen"
+      },
+      {
+        "given": "Benjamin M.",
+        "family": "Neale"
+      },
+      {
+        "given": "Danielle",
+        "family": "Posthuma"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          8,
+          19
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggr84r",
+    "container-title-short": "Nat Genet",
+    "PMID": "31427789",
+    "id": "pZZn28he",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-019-0481-0"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7",
+    "DOI": "10.1038/s41592-019-0456-1",
+    "type": "article-journal",
+    "page": "607-610",
+    "source": "Crossref",
+    "title": "Pathway-level information extractor (PLIER) for gene expression data",
+    "volume": "16",
+    "author": [
+      {
+        "given": "Weiguang",
+        "family": "Mao"
+      },
+      {
+        "given": "Elena",
+        "family": "Zaslavsky"
+      },
+      {
+        "given": "Boris M.",
+        "family": "Hartmann"
+      },
+      {
+        "given": "Stuart C.",
+        "family": "Sealfon"
+      },
+      {
+        "given": "Maria",
+        "family": "Chikina"
+      }
+    ],
+    "container-title": "Nature Methods",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          6,
+          27
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf75g6",
+    "container-title-short": "Nat Methods",
+    "PMCID": "PMC7262669",
+    "PMID": "31249421",
+    "id": "Ki2ij7zE",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41592-019-0456-1"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "12",
+    "DOI": "10.1038/sj.jid.5700586",
+    "type": "article-journal",
+    "page": "2637-2646",
+    "source": "Crossref",
+    "title": "Langerhans Cells Release Prostaglandin D2 in Response to Nicotinic Acid",
+    "volume": "126",
+    "author": [
+      {
+        "given": "Dominique",
+        "family": "Maciejewski-Lenoir"
+      },
+      {
+        "given": "Jeremy G.",
+        "family": "Richman"
+      },
+      {
+        "given": "Yaron",
+        "family": "Hakak"
+      },
+      {
+        "given": "Ibragim",
+        "family": "Gaidarov"
+      },
+      {
+        "given": "Dominic P.",
+        "family": "Behan"
+      },
+      {
+        "given": "Daniel T.",
+        "family": "Connolly"
+      }
+    ],
+    "container-title": "Journal of Investigative Dermatology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2006,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/dgxg75",
+    "container-title-short": "Journal of Investigative Dermatology",
+    "PMID": "17008871",
+    "id": "wI0IjT3i",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/sj.jid.5700586"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/srep16882",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Mycobacterial infection induces a specific human innate immune response",
+    "volume": "5",
+    "author": [
+      {
+        "given": "John D.",
+        "family": "Blischak"
+      },
+      {
+        "given": "Ludovic",
+        "family": "Tailleux"
+      },
+      {
+        "given": "Amy",
+        "family": "Mitrano"
+      },
+      {
+        "given": "Luis B.",
+        "family": "Barreiro"
+      },
+      {
+        "given": "Yoav",
+        "family": "Gilad"
+      }
+    ],
+    "container-title": "Scientific Reports",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          11,
+          20
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7zk5c",
+    "container-title-short": "Sci Rep",
+    "PMCID": "PMC4653619",
+    "PMID": "26586179",
+    "id": "1kgcHkGm",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/srep16882"
+  },
+  {
+    "publisher": "CSIRO Publishing",
+    "issue": "7",
+    "abstract": "<jats:p>\nPolycystic ovary syndrome (PCOS) is one of the most common ovarian diseases among women of reproductive age. The reproductive and metabolic traits of PCOS are underpinned by adipocyte dysfunction, especially diminished adiponectin secretion. Based on evidence that niacin stimulates adiponectin secretion, this study evaluated the effects of niacin on adiponectin concentrations and reproductive traits in a rat model of PCOS. PCOS was induced by single injection of 4mg kg−1 oestradiol valerate (i.m.), and PCOS groups were administered orally with saline or niacin (10 or 25mg kg−1) daily for 30 days after PCOS induction. The control group received 0.2mL sesame oil (i.m.) only. At the end of the experimental period, serum samples and ovaries were collected for adiponectin, histological and molecular analyses. Niacin reduced the bodyweight gain and increased ovary weights in PCOS rats. Niacin also increased the number of normal antral follicles and corpora lutea while reducing the number of cystic follicles and the thickness of theca interna. Moreover, niacin significantly increased serum adiponectin concentration and the gene expression of adiponectin and its type 1 receptor. In conclusion, this study indicates that niacin reduces cystic follicles and improves ovulation in PCOS rats. Adiponectin signalling may have contributed, in part, to the beneficial effects.\n</jats:p>",
+    "DOI": "10.1071/rd20306",
+    "type": "article-journal",
+    "page": "447",
+    "source": "Crossref",
+    "title": "Chronic niacin administration ameliorates ovulation, histological changes in the ovary and adiponectin concentrations in a rat model of polycystic ovary syndrome",
+    "volume": "33",
+    "author": [
+      {
+        "given": "Negin",
+        "family": "Asadi"
+      },
+      {
+        "given": "Mahin",
+        "family": "Izadi"
+      },
+      {
+        "given": "Ali",
+        "family": "Aflatounian"
+      },
+      {
+        "given": "Mansour",
+        "family": "Esmaeili-Dehaj"
+      },
+      {
+        "given": "Mohammad Ebrahim",
+        "family": "Rezvani"
+      },
+      {
+        "given": "Zeinab",
+        "family": "Hafizi"
+      }
+    ],
+    "container-title": "Reproduction, Fertility and Development",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2021
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjpjkt",
+    "container-title-short": "Reprod. Fertil. Dev.",
+    "PMID": "33751926",
+    "id": "TovvsrDr",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1071/rd20306"
+  },
+  {
+    "publisher": "Proceedings of the National Academy of Sciences",
+    "issue": "52",
+    "abstract": "<jats:p>Heritable diseases are caused by germ-line mutations that, despite tissuewide presence, often lead to tissue-specific pathology. Here, we make a systematic analysis of the link between tissue-specific gene expression and pathological manifestations in many human diseases and cancers. Diseases were systematically mapped to tissues they affect from disease-relevant literature in PubMed to create a disease–tissue covariation matrix of high-confidence associations of &gt;1,000 diseases to 73 tissues. By retrieving &gt;2,000 known disease genes, and generating 1,500 disease-associated protein complexes, we analyzed the differential expression of a gene or complex involved in a particular disease in the tissues affected by the disease, compared with nonaffected tissues. When this analysis is scaled to all diseases in our dataset, there is a significant tendency for disease genes and complexes to be overexpressed in the normal tissues where defects cause pathology. In contrast, cancer genes and complexes were not overexpressed in the tissues from which the tumors emanate. We specifically identified a complex involved in XY sex reversal that is testis-specific and down-regulated in ovaries. We also identified complexes in Parkinson disease, cardiomyopathies, and muscular dystrophy syndromes that are similarly tissue specific. Our method represents a conceptual scaffold for organism-spanning analyses and reveals an extensive list of tissue-specific draft molecular pathways, both known and unexpected, that might be disrupted in disease.</jats:p>",
+    "DOI": "10.1073/pnas.0810772105",
+    "type": "article-journal",
+    "page": "20870-20875",
+    "source": "Crossref",
+    "title": "A large-scale analysis of tissue-specific pathology and gene expression of human disease genes and complexes",
+    "volume": "105",
+    "author": [
+      {
+        "given": "Kasper",
+        "family": "Lage"
+      },
+      {
+        "given": "Niclas Tue",
+        "family": "Hansen"
+      },
+      {
+        "given": "E. Olof",
+        "family": "Karlberg"
+      },
+      {
+        "given": "Aron C.",
+        "family": "Eklund"
+      },
+      {
+        "given": "Francisco S.",
+        "family": "Roque"
+      },
+      {
+        "given": "Patricia K.",
+        "family": "Donahoe"
+      },
+      {
+        "given": "Zoltan",
+        "family": "Szallasi"
+      },
+      {
+        "given": "Thomas Skøt",
+        "family": "Jensen"
+      },
+      {
+        "given": "Søren",
+        "family": "Brunak"
+      }
+    ],
+    "container-title": "Proceedings of the National Academy of Sciences",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          12,
+          30
+        ]
+      ]
+    },
+    "URL": "https://doi.org/d5qcv9",
+    "container-title-short": "Proc. Natl. Acad. Sci. U.S.A.",
+    "PMCID": "PMC2606902",
+    "PMID": "19104045",
+    "id": "wNE0EQlN",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1073/pnas.0810772105"
+  },
+  {
+    "publisher": "IOP Publishing",
+    "DOI": "10.1088/1755-1315/31/1/012012",
+    "type": "article-journal",
+    "page": "012012",
+    "source": "Crossref",
+    "title": "Determination of Optimal Epsilon (Eps) Value on DBSCAN Algorithm to Clustering Data on Peatland Hotspots in Sumatra",
+    "volume": "31",
+    "author": [
+      {
+        "given": "Nadia",
+        "family": "Rahmah"
+      },
+      {
+        "given": "Imas Sukaesih",
+        "family": "Sitanggang"
+      }
+    ],
+    "container-title": "IOP Conference Series: Earth and Environmental Science",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqr7z2",
+    "container-title-short": "IOP Conf. Ser.: Earth Environ. Sci.",
+    "id": "FB7XPWl6",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1088/1755-1315/31/1/012012"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "8",
+    "DOI": "10.1093/bioinformatics/btq099",
+    "type": "article-journal",
+    "page": "1112-1118",
+    "source": "Crossref",
+    "title": "Modeling sample variables with an Experimental Factor Ontology",
+    "volume": "26",
+    "author": [
+      {
+        "given": "James",
+        "family": "Malone"
+      },
+      {
+        "given": "Ele",
+        "family": "Holloway"
+      },
+      {
+        "given": "Tomasz",
+        "family": "Adamusiak"
+      },
+      {
+        "given": "Misha",
+        "family": "Kapushesky"
+      },
+      {
+        "given": "Jie",
+        "family": "Zheng"
+      },
+      {
+        "given": "Nikolay",
+        "family": "Kolesnikov"
+      },
+      {
+        "given": "Anna",
+        "family": "Zhukova"
+      },
+      {
+        "given": "Alvis",
+        "family": "Brazma"
+      },
+      {
+        "given": "Helen",
+        "family": "Parkinson"
+      }
+    ],
+    "container-title": "Bioinformatics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2010,
+          3,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/dsb6vt",
+    "PMCID": "PMC2853691",
+    "PMID": "20200009",
+    "id": "9okjVu3s",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/bioinformatics/btq099"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "7",
+    "DOI": "10.1093/gigascience/giy083",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Clustering trees: a visualization for evaluating clusterings at multiple resolutions",
+    "volume": "7",
+    "author": [
+      {
+        "given": "Luke",
+        "family": "Zappia"
+      },
+      {
+        "given": "Alicia",
+        "family": "Oshlack"
+      }
+    ],
+    "container-title": "GigaScience",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          7,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfzqf5",
+    "PMCID": "PMC6057528",
+    "PMID": "30010766",
+    "id": "xhtEAzx6",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/gigascience/giy083"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "D1",
+    "DOI": "10.1093/nar/gkt1068",
+    "type": "article-journal",
+    "page": "D1091-D1097",
+    "source": "Crossref",
+    "title": "DrugBank 4.0: shedding new light on drug metabolism",
+    "volume": "42",
+    "author": [
+      {
+        "given": "Vivian",
+        "family": "Law"
+      },
+      {
+        "given": "Craig",
+        "family": "Knox"
+      },
+      {
+        "given": "Yannick",
+        "family": "Djoumbou"
+      },
+      {
+        "given": "Tim",
+        "family": "Jewison"
+      },
+      {
+        "given": "An Chi",
+        "family": "Guo"
+      },
+      {
+        "given": "Yifeng",
+        "family": "Liu"
+      },
+      {
+        "given": "Adam",
+        "family": "Maciejewski"
+      },
+      {
+        "given": "David",
+        "family": "Arndt"
+      },
+      {
+        "given": "Michael",
+        "family": "Wilson"
+      },
+      {
+        "given": "Vanessa",
+        "family": "Neveu"
+      },
+      {
+        "given": "Alexandra",
+        "family": "Tang"
+      },
+      {
+        "given": "Geraldine",
+        "family": "Gabriel"
+      },
+      {
+        "given": "Carol",
+        "family": "Ly"
+      },
+      {
+        "given": "Sakina",
+        "family": "Adamjee"
+      },
+      {
+        "given": "Zerihun T.",
+        "family": "Dame"
+      },
+      {
+        "given": "Beomsoo",
+        "family": "Han"
+      },
+      {
+        "given": "You",
+        "family": "Zhou"
+      },
+      {
+        "given": "David S.",
+        "family": "Wishart"
+      }
+    ],
+    "container-title": "Nucleic Acids Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          11,
+          6
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3mn6d",
+    "container-title-short": "Nucl. Acids Res.",
+    "PMCID": "PMC3965102",
+    "PMID": "24203711",
+    "id": "6PR8LEXK",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/nar/gkt1068"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "D1",
+    "DOI": "10.1093/nar/gky1032",
+    "type": "article-journal",
+    "page": "D955-D962",
+    "source": "Crossref",
+    "title": "Human Disease Ontology 2018 update: classification, content and workflow expansion",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Lynn M",
+        "family": "Schriml"
+      },
+      {
+        "given": "Elvira",
+        "family": "Mitraka"
+      },
+      {
+        "given": "James",
+        "family": "Munro"
+      },
+      {
+        "given": "Becky",
+        "family": "Tauber"
+      },
+      {
+        "given": "Mike",
+        "family": "Schor"
+      },
+      {
+        "given": "Lance",
+        "family": "Nickle"
+      },
+      {
+        "given": "Victor",
+        "family": "Felix"
+      },
+      {
+        "given": "Linda",
+        "family": "Jeng"
+      },
+      {
+        "given": "Cynthia",
+        "family": "Bearer"
+      },
+      {
+        "given": "Richard",
+        "family": "Lichenstein"
+      },
+      {
+        "given": "Katharine",
+        "family": "Bisordi"
+      },
+      {
+        "given": "Nicole",
+        "family": "Campion"
+      },
+      {
+        "given": "Brooke",
+        "family": "Hyman"
+      },
+      {
+        "given": "David",
+        "family": "Kurland"
+      },
+      {
+        "given": "Connor Patrick",
+        "family": "Oates"
+      },
+      {
+        "given": "Siobhan",
+        "family": "Kibbey"
+      },
+      {
+        "given": "Poorna",
+        "family": "Sreekumar"
+      },
+      {
+        "given": "Chris",
+        "family": "Le"
+      },
+      {
+        "given": "Michelle",
+        "family": "Giglio"
+      },
+      {
+        "given": "Carol",
+        "family": "Greene"
+      }
+    ],
+    "container-title": "Nucleic Acids Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          11,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggx9wp",
+    "PMCID": "PMC6323977",
+    "PMID": "30407550",
+    "id": "1FsruosUW",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/nar/gky1032"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "12",
+    "DOI": "10.1093/qjmed/hci136",
+    "type": "article-journal",
+    "page": "845-856",
+    "source": "Crossref",
+    "title": "Reverse cholesterol transport and cholesterol efflux in atherosclerosis",
+    "volume": "98",
+    "author": [
+      {
+        "given": "R.",
+        "family": "Ohashi"
+      },
+      {
+        "given": "H.",
+        "family": "Mu"
+      },
+      {
+        "given": "X.",
+        "family": "Wang"
+      },
+      {
+        "given": "Q.",
+        "family": "Yao"
+      },
+      {
+        "given": "C.",
+        "family": "Chen"
+      }
+    ],
+    "container-title": "QJM: An International Journal of Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2005,
+          10,
+          28
+        ]
+      ]
+    },
+    "URL": "https://doi.org/dn2fgt",
+    "PMID": "16258026",
+    "id": "idlBgtFz",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/qjmed/hci136"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Gene set enrichment analysis (GSEA) is an ubiquitously used tool for evaluating pathway enrichment in transcriptional data. Typical experimental design consists in comparing two conditions with several replicates using a differential gene expression test followed by preranked GSEA performed against a collection of hundreds and thousands of pathways. However, the reference implementation of this method cannot accurately estimate small P-values, which significantly limits its sensitivity due to multiple hypotheses correction procedure.</jats:p><jats:p>Here we present FGSEA (Fast Gene Set Enrichment Analysis) method that is able to estimate arbitrarily low GSEA P-values with a high accuracy in a matter of minutes or even seconds. To confirm the accuracy of the method, we also developed an exact algorithm for GSEA P-values calculation for integer gene-level statistics. Using the exact algorithm as a reference we show that FGSEA is able to routinely estimate P-values up to 10<jats:sup>−100</jats:sup> with a small and predictable estimation error. We systematically evaluate FGSEA on a collection of 605 datasets and show that FGSEA recovers much more statistically significant pathways compared to other implementations.</jats:p><jats:p>FGSEA is open source and available as an R package in Bioconductor (<jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"http://bioconductor.org/packages/fgsea/\">http://bioconductor.org/packages/fgsea/</jats:ext-link>) and on GitHub (<jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"https://github.com/ctlab/fgsea/\">https://github.com/ctlab/fgsea/</jats:ext-link>).</jats:p>",
+    "DOI": "10.1101/060012",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "Fast gene set enrichment analysis",
+    "author": [
+      {
+        "given": "Gennady",
+        "family": "Korotkevich"
+      },
+      {
+        "given": "Vladimir",
+        "family": "Sukhov"
+      },
+      {
+        "given": "Nikolay",
+        "family": "Budin"
+      },
+      {
+        "given": "Boris",
+        "family": "Shpak"
+      },
+      {
+        "given": "Maxim N.",
+        "family": "Artyomov"
+      },
+      {
+        "given": "Alexey",
+        "family": "Sergushichev"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          6,
+          20
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfpqhm",
+    "id": "Z8WXLD67",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/060012"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>There are currently &gt;1.3 million human –omics samples that are publicly available. This valuable resource remains acutely underused because discovering particular samples from this ever-growing data collection remains a significant challenge. The major impediment is that sample attributes are routinely described using varied terminologies written in unstructured natural language. We propose a natural-language-processing-based machine learning approach (NLP-ML) to infer tissue and cell-type annotations for –omics samples based only on their free-text metadata. NLP-ML works by creating numerical representations of sample descriptions and using these representations as features in a supervised learning classifier that predicts tissue/cell-type terms. Our approach significantly outperforms an advanced graph-based reasoning annotation method (MetaSRA) and a baseline exact string matching method (TAGGER). Model similarities between related tissues demonstrate that NLP-ML models capture biologically-meaningful signals in text. Additionally, these models correctly classify tissue-associated biological processes and diseases based on their text descriptions alone. NLP-ML models are nearly as accurate as models based on gene-expression profiles in predicting sample tissue annotations but have the distinct capability to classify samples irrespective of the –omics experiment type based on their text metadata. Python NLP-ML prediction code and trained tissue models are available at <jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"https://github.com/krishnanlab/txt2onto\">https://github.com/krishnanlab/txt2onto</jats:ext-link>.</jats:p>",
+    "DOI": "10.1101/2021.05.10.443525",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "Systematic tissue annotations of –omics samples by modeling unstructured metadata",
+    "author": [
+      {
+        "given": "Nathaniel T.",
+        "family": "Hawkins"
+      },
+      {
+        "given": "Marc",
+        "family": "Maldaver"
+      },
+      {
+        "given": "Anna",
+        "family": "Yannakopoulos"
+      },
+      {
+        "given": "Lindsay A.",
+        "family": "Guare"
+      },
+      {
+        "given": "Arjun",
+        "family": "Krishnan"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          5,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gj2pkc",
+    "id": "fnDaLjFy",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2021.05.10.443525"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>ABSTRACT</jats:title><jats:p>We present recount3, a resource consisting of over 750,000 publicly available human and mouse RNA sequencing (RNA-seq) samples uniformly processed by our new <jats:monospace>Monorail</jats:monospace> analysis pipeline. To facilitate access to the data, we provide the <jats:monospace>recount3</jats:monospace> and <jats:monospace>snapcount</jats:monospace> R/Bioconductor packages as well as complementary web resources. Using these tools, data can be downloaded as study-level summaries or queried for specific exon-exon junctions, genes, samples, or other features. <jats:monospace>Monorail</jats:monospace> can be used to process local and/or private data, allowing results to be directly compared to any study in recount3. Taken together, our tools help biologists maximize the utility of publicly available RNA-seq data, especially to improve their understanding of newly collected data. recount3 is available from <jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"http://rna.recount.bio\">http://rna.recount.bio</jats:ext-link>.</jats:p>",
+    "DOI": "10.1101/2021.05.21.445138",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "recount3: summaries and queries for large-scale RNA-seq expression and splicing",
+    "author": [
+      {
+        "given": "Christopher",
+        "family": "Wilks"
+      },
+      {
+        "given": "Shijie C.",
+        "family": "Zheng"
+      },
+      {
+        "given": "Feng Yong",
+        "family": "Chen"
+      },
+      {
+        "given": "Rone",
+        "family": "Charles"
+      },
+      {
+        "given": "Brad",
+        "family": "Solomon"
+      },
+      {
+        "given": "Jonathan P.",
+        "family": "Ling"
+      },
+      {
+        "given": "Eddie Luidy",
+        "family": "Imada"
+      },
+      {
+        "given": "David",
+        "family": "Zhang"
+      },
+      {
+        "given": "Lance",
+        "family": "Joseph"
+      },
+      {
+        "given": "Jeffrey T.",
+        "family": "Leek"
+      },
+      {
+        "given": "Andrew E.",
+        "family": "Jaffe"
+      },
+      {
+        "given": "Abhinav",
+        "family": "Nellore"
+      },
+      {
+        "given": "Leonardo",
+        "family": "Collado-Torres"
+      },
+      {
+        "given": "Kasper D.",
+        "family": "Hansen"
+      },
+      {
+        "given": "Ben",
+        "family": "Langmead"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          5,
+          23
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gj7cmq",
+    "id": "TPVeG4GP",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2021.05.21.445138"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Understanding genetic factors of complex traits across ancestry groups holds a key to improve the overall health care quality for diverse populations in the United States. In recent years, multiple electronic health record-linked (EHR-linked) biobanks have recruited participants of diverse ancestry backgrounds; these biobanks make it possible to obtain phenome-wide association study (PheWAS) summary statistics on a genome-wide scale for different ancestry groups. Moreover, advancement in bioinformatics methods provide novel means to accelerate the translation of basic discoveries to clinical utility by integrating GWAS summary statistics and expression quantitative trait locus (eQTL) data to identify complex trait-related genes, such as transcriptome-wide association study (TWAS) and colocalization analyses. Here, we combined the advantages of multi-ancestry biobanks and data integrative approaches to investigate the multi-ancestry, gene-disease connection landscape. We first performed a phenome-wide TWAS on Electronic Medical Records and Genomics (eMERGE) III network participants of European ancestry (N = 68,813) and participants of African ancestry (N = 12,658) populations, separately. For each ancestry group, the phenome-wide TWAS tested gene-disease associations between 22,535 genes and 309 curated disease phenotypes in 49 primary human tissues, as well as cross-tissue associations. Next, we identified gene-disease associations that were shared across the two ancestry groups by combining the ancestry-specific results via meta-analyses. We further applied a Bayesian colocalization method, fastENLOC, to prioritize likely functional gene-disease associations with supportive colocalized eQTL and GWAS signals. 
We replicated the phenome-wide gene-disease analysis in the analogous Penn Medicine BioBank (PMBB) cohorts and sought additional validations in the PhenomeXcan UK Biobank (UKBB) database, PheWAS catalog, and systematic literature review. Phenome-wide TWAS identified many proof-of-concept gene-disease associations, e.g. <jats:italic>FTO</jats:italic>-obesity association (p = 7.29e-15), and numerous novel disease-associated genes, e.g. association between <jats:italic>GATA6-AS1</jats:italic> with pulmonary heart disease (p = 4.60e-10). In short, the multi-ancestry, gene-disease connection landscape provides rich resources for future multi-ancestry complex disease research. We also highlight the importance of expanding the size of non-European ancestry datasets and the potential of exploring ancestry-specific genetic analyses as these will be critical to improve our understanding of the genetic architecture of complex disease.</jats:p>",
+    "DOI": "10.1101/2021.10.21.21265225",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "Multi-ancestry gene-trait connection landscape using electronic health record (EHR) linked biobank data",
+    "author": [
+      {
+        "given": "Binglan",
+        "family": "Li"
+      },
+      {
+        "given": "Yogasudha",
+        "family": "Veturi"
+      },
+      {
+        "given": "Anastasia",
+        "family": "Lucas"
+      },
+      {
+        "given": "Yuki",
+        "family": "Bradford"
+      },
+      {
+        "given": "Shefali S.",
+        "family": "Verma"
+      },
+      {
+        "given": "Anurag",
+        "family": "Verma"
+      },
+      {
+        "given": "Joseph",
+        "family": "Park"
+      },
+      {
+        "given": "Wei-Qi",
+        "family": "Wei"
+      },
+      {
+        "given": "Qiping",
+        "family": "Feng"
+      },
+      {
+        "given": "Bahram",
+        "family": "Namjou"
+      },
+      {
+        "given": "Krzysztof",
+        "family": "Kiryluk"
+      },
+      {
+        "given": "Iftikhar",
+        "family": "Kullo"
+      },
+      {
+        "given": "Yuan",
+        "family": "Luo"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      },
+      {
+        "given": "Marylyn D.",
+        "family": "Ritchie"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          10,
+          26
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gnbdnb",
+    "id": "gZAOkumx",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2021.10.21.21265225"
+  },
+  {
+    "publisher": "Institute of Electrical and Electronics Engineers (IEEE)",
+    "issue": "6",
+    "DOI": "10.1109/tpami.2005.113",
+    "type": "article-journal",
+    "page": "835-850",
+    "source": "Crossref",
+    "title": "Combining multiple clusterings using evidence accumulation",
+    "volume": "27",
+    "author": [
+      {
+        "given": "Ana L.N.",
+        "family": "Fred"
+      },
+      {
+        "given": "Anil K.",
+        "family": "Jain"
+      }
+    ],
+    "container-title": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+    "issued": {
+      "date-parts": [
+        [
+          2005,
+          6
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bsknv6",
+    "container-title-short": "IEEE Trans. Pattern Anal. Mach. Intell.",
+    "PMID": "15943417",
+    "id": "cuROQDFa",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1109/tpami.2005.113"
+  },
+  {
+    "publisher": "Institute of Electrical and Electronics Engineers (IEEE)",
+    "issue": "12",
+    "DOI": "10.1109/tpami.2011.84",
+    "type": "article-journal",
+    "page": "2396-2409",
+    "source": "Crossref",
+    "title": "A Link-Based Approach to the Cluster Ensemble Problem",
+    "volume": "33",
+    "author": [
+      {
+        "given": "Natthakan",
+        "family": "Iam-On"
+      },
+      {
+        "given": "Tossapon",
+        "family": "Boongoen"
+      },
+      {
+        "given": "Simon",
+        "family": "Garrett"
+      },
+      {
+        "given": "Chris",
+        "family": "Price"
+      }
+    ],
+    "container-title": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cqgkh3",
+    "container-title-short": "IEEE Trans. Pattern Anal. Mach. Intell.",
+    "PMID": "21576752",
+    "id": "rcTMvL18",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1109/tpami.2011.84"
+  },
+  {
+    "publisher": "Wiley",
+    "issue": "3",
+    "DOI": "10.1111/j.1076-7460.2007.06696.x",
+    "type": "article-journal",
+    "page": "143-149",
+    "source": "Crossref",
+    "title": "Cardiovascular Risk Factors for Alzheimer's Disease",
+    "volume": "16",
+    "author": [
+      {
+        "given": "Clive",
+        "family": "Rosendorff"
+      },
+      {
+        "given": "Michal S.",
+        "family": "Beeri"
+      },
+      {
+        "given": "Jeremy M.",
+        "family": "Silverman"
+      }
+    ],
+    "container-title": "The American Journal of Geriatric Cardiology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2007,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bpfw5d",
+    "container-title-short": "Amer J Geriatric Cardiol",
+    "PMID": "17483665",
+    "id": "9BGyO071",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1111/j.1076-7460.2007.06696.x"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "37",
+    "abstract": "<jats:p>PhenomeXcan is a gene-based resource of gene-trait associations, providing biological contexts for translational research.</jats:p>",
+    "DOI": "10.1126/sciadv.aba2083",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "PhenomeXcan: Mapping the genome to the phenome through the transcriptome",
+    "volume": "6",
+    "author": [
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Padma S.",
+        "family": "Rajagopal"
+      },
+      {
+        "given": "Alvaro",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Owen",
+        "family": "Melia"
+      },
+      {
+        "given": "Lisa",
+        "family": "Bastarache"
+      },
+      {
+        "given": "YoSon",
+        "family": "Park"
+      },
+      {
+        "given": "GTEx",
+        "family": "Consortium"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      },
+      {
+        "given": "Hae K.",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Science Advances",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghbvbf",
+    "container-title-short": "Sci. Adv.",
+    "PMID": "32917697",
+    "id": "lY5ln3dB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/sciadv.aba2083"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "6259",
+    "abstract": "<jats:title>An Aluring new autoantibody target</jats:title>\n          <jats:p>\n            Autoimmunity is the immune system's ultimate act of betrayal. Cells designed to protect against invading microbes suddenly target the host instead. In the autoimmune disease systemic lupus erythematosus, antibodies target DNA and host proteins, including the RNA binding protein Ro60. Hung\n            <jats:italic>et al.</jats:italic>\n            discovered that Ro60 bound to endogenous Alu retroelements. They detected antibody-Ro60-Alu RNA immune complexes in the blood of individuals with lupus and an enrichment of Alu transcripts. Ro60 bound to Alu probably primes RNA-binding innate immune receptors within B cells, leading these cells to make antibodies that target Ro60-Alu RNA and drive disease-causing inflammation.\n          </jats:p>\n          <jats:p>\n            <jats:italic>Science</jats:italic>\n            , this issue p.\n            <jats:related-article xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"doi\" issue=\"6259\" page=\"455\" related-article-type=\"in-this-issue\" vol=\"350\" xlink:href=\"10.1126/science.aac7442\">455</jats:related-article>\n          </jats:p>",
+    "DOI": "10.1126/science.aac7442",
+    "type": "article-journal",
+    "page": "455-459",
+    "source": "Crossref",
+    "title": "The Ro60 autoantigen binds endogenous retroelements and regulates inflammatory gene expression",
+    "volume": "350",
+    "author": [
+      {
+        "given": "T.",
+        "family": "Hung"
+      },
+      {
+        "given": "G. A.",
+        "family": "Pratt"
+      },
+      {
+        "given": "B.",
+        "family": "Sundararaman"
+      },
+      {
+        "given": "M. J.",
+        "family": "Townsend"
+      },
+      {
+        "given": "C.",
+        "family": "Chaivorapol"
+      },
+      {
+        "given": "T.",
+        "family": "Bhangale"
+      },
+      {
+        "given": "R. R.",
+        "family": "Graham"
+      },
+      {
+        "given": "W.",
+        "family": "Ortmann"
+      },
+      {
+        "given": "L. A.",
+        "family": "Criswell"
+      },
+      {
+        "given": "G. W.",
+        "family": "Yeo"
+      },
+      {
+        "given": "T. W.",
+        "family": "Behrens"
+      }
+    ],
+    "container-title": "Science",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          10,
+          23
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7vs67",
+    "container-title-short": "Science",
+    "PMCID": "PMC4691329",
+    "PMID": "26382853",
+    "id": "EnoqU4ga",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/science.aac7442"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "6509",
+    "abstract": "<jats:p>The Genotype-Tissue Expression (GTEx) project dissects how genetic variation affects gene expression and splicing.</jats:p>",
+    "DOI": "10.1126/science.aaz1776",
+    "type": "article-journal",
+    "page": "1318-1330",
+    "source": "Crossref",
+    "title": "The GTEx Consortium atlas of genetic regulatory effects across human tissues",
+    "volume": "369",
+    "author": [
+      {},
+      {
+        "given": "François",
+        "family": "Aguet"
+      },
+      {
+        "given": "Shankara",
+        "family": "Anand"
+      },
+      {
+        "given": "Kristin G.",
+        "family": "Ardlie"
+      },
+      {
+        "given": "Stacey",
+        "family": "Gabriel"
+      },
+      {
+        "given": "Gad A.",
+        "family": "Getz"
+      },
+      {
+        "given": "Aaron",
+        "family": "Graubert"
+      },
+      {
+        "given": "Kane",
+        "family": "Hadley"
+      },
+      {
+        "given": "Robert E.",
+        "family": "Handsaker"
+      },
+      {
+        "given": "Katherine H.",
+        "family": "Huang"
+      },
+      {
+        "given": "Seva",
+        "family": "Kashin"
+      },
+      {
+        "given": "Xiao",
+        "family": "Li"
+      },
+      {
+        "given": "Daniel G.",
+        "family": "MacArthur"
+      },
+      {
+        "given": "Samuel R.",
+        "family": "Meier"
+      },
+      {
+        "given": "Jared L.",
+        "family": "Nedzel"
+      },
+      {
+        "given": "Duyen T.",
+        "family": "Nguyen"
+      },
+      {
+        "given": "Ayellet V.",
+        "family": "Segrè"
+      },
+      {
+        "given": "Ellen",
+        "family": "Todres"
+      },
+      {
+        "given": "Brunilda",
+        "family": "Balliu"
+      },
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Alexis",
+        "family": "Battle"
+      },
+      {
+        "given": "Rodrigo",
+        "family": "Bonazzola"
+      },
+      {
+        "given": "Andrew",
+        "family": "Brown"
+      },
+      {
+        "given": "Christopher D.",
+        "family": "Brown"
+      },
+      {
+        "given": "Stephane E.",
+        "family": "Castel"
+      },
+      {
+        "given": "Donald F.",
+        "family": "Conrad"
+      },
+      {
+        "given": "Daniel J.",
+        "family": "Cotter"
+      },
+      {
+        "given": "Nancy",
+        "family": "Cox"
+      },
+      {
+        "given": "Sayantan",
+        "family": "Das"
+      },
+      {
+        "given": "Olivia M.",
+        "family": "de Goede"
+      },
+      {
+        "given": "Emmanouil T.",
+        "family": "Dermitzakis"
+      },
+      {
+        "given": "Jonah",
+        "family": "Einson"
+      },
+      {
+        "given": "Barbara E.",
+        "family": "Engelhardt"
+      },
+      {
+        "given": "Eleazar",
+        "family": "Eskin"
+      },
+      {
+        "given": "Tiffany Y.",
+        "family": "Eulalio"
+      },
+      {
+        "given": "Nicole M.",
+        "family": "Ferraro"
+      },
+      {
+        "given": "Elise D.",
+        "family": "Flynn"
+      },
+      {
+        "given": "Laure",
+        "family": "Fresard"
+      },
+      {
+        "given": "Eric R.",
+        "family": "Gamazon"
+      },
+      {
+        "given": "Diego",
+        "family": "Garrido-Martín"
+      },
+      {
+        "given": "Nicole R.",
+        "family": "Gay"
+      },
+      {
+        "given": "Michael J.",
+        "family": "Gloudemans"
+      },
+      {
+        "given": "Roderic",
+        "family": "Guigó"
+      },
+      {
+        "given": "Andrew R.",
+        "family": "Hame"
+      },
+      {
+        "given": "Yuan",
+        "family": "He"
+      },
+      {
+        "given": "Paul J.",
+        "family": "Hoffman"
+      },
+      {
+        "given": "Farhad",
+        "family": "Hormozdiari"
+      },
+      {
+        "given": "Lei",
+        "family": "Hou"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Brian",
+        "family": "Jo"
+      },
+      {
+        "given": "Silva",
+        "family": "Kasela"
+      },
+      {
+        "given": "Manolis",
+        "family": "Kellis"
+      },
+      {
+        "given": "Sarah",
+        "family": "Kim-Hellmuth"
+      },
+      {
+        "given": "Alan",
+        "family": "Kwong"
+      },
+      {
+        "given": "Tuuli",
+        "family": "Lappalainen"
+      },
+      {
+        "given": "Xin",
+        "family": "Li"
+      },
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Serghei",
+        "family": "Mangul"
+      },
+      {
+        "given": "Pejman",
+        "family": "Mohammadi"
+      },
+      {
+        "given": "Stephen B.",
+        "family": "Montgomery"
+      },
+      {
+        "given": "Manuel",
+        "family": "Muñoz-Aguirre"
+      },
+      {
+        "given": "Daniel C.",
+        "family": "Nachun"
+      },
+      {
+        "given": "Andrew B.",
+        "family": "Nobel"
+      },
+      {
+        "given": "Meritxell",
+        "family": "Oliva"
+      },
+      {
+        "given": "YoSon",
+        "family": "Park"
+      },
+      {
+        "given": "Yongjin",
+        "family": "Park"
+      },
+      {
+        "given": "Princy",
+        "family": "Parsana"
+      },
+      {
+        "given": "Abhiram S.",
+        "family": "Rao"
+      },
+      {
+        "given": "Ferran",
+        "family": "Reverter"
+      },
+      {
+        "given": "John M.",
+        "family": "Rouhana"
+      },
+      {
+        "given": "Chiara",
+        "family": "Sabatti"
+      },
+      {
+        "given": "Ashis",
+        "family": "Saha"
+      },
+      {
+        "given": "Matthew",
+        "family": "Stephens"
+      },
+      {
+        "given": "Barbara E.",
+        "family": "Stranger"
+      },
+      {
+        "given": "Benjamin J.",
+        "family": "Strober"
+      },
+      {
+        "given": "Nicole A.",
+        "family": "Teran"
+      },
+      {
+        "given": "Ana",
+        "family": "Viñuela"
+      },
+      {
+        "given": "Gao",
+        "family": "Wang"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      },
+      {
+        "given": "Fred",
+        "family": "Wright"
+      },
+      {
+        "given": "Valentin",
+        "family": "Wucher"
+      },
+      {
+        "given": "Yuxin",
+        "family": "Zou"
+      },
+      {
+        "given": "Pedro G.",
+        "family": "Ferreira"
+      },
+      {
+        "given": "Gen",
+        "family": "Li"
+      },
+      {
+        "given": "Marta",
+        "family": "Melé"
+      },
+      {
+        "given": "Esti",
+        "family": "Yeger-Lotem"
+      },
+      {
+        "given": "Mary E.",
+        "family": "Barcus"
+      },
+      {
+        "given": "Debra",
+        "family": "Bradbury"
+      },
+      {
+        "given": "Tanya",
+        "family": "Krubit"
+      },
+      {
+        "given": "Jeffrey A.",
+        "family": "McLean"
+      },
+      {
+        "given": "Liqun",
+        "family": "Qi"
+      },
+      {
+        "given": "Karna",
+        "family": "Robinson"
+      },
+      {
+        "given": "Nancy V.",
+        "family": "Roche"
+      },
+      {
+        "given": "Anna M.",
+        "family": "Smith"
+      },
+      {
+        "given": "Leslie",
+        "family": "Sobin"
+      },
+      {
+        "given": "David E.",
+        "family": "Tabor"
+      },
+      {
+        "given": "Anita",
+        "family": "Undale"
+      },
+      {
+        "given": "Jason",
+        "family": "Bridge"
+      },
+      {
+        "given": "Lori E.",
+        "family": "Brigham"
+      },
+      {
+        "given": "Barbara A.",
+        "family": "Foster"
+      },
+      {
+        "given": "Bryan M.",
+        "family": "Gillard"
+      },
+      {
+        "given": "Richard",
+        "family": "Hasz"
+      },
+      {
+        "given": "Marcus",
+        "family": "Hunter"
+      },
+      {
+        "given": "Christopher",
+        "family": "Johns"
+      },
+      {
+        "given": "Mark",
+        "family": "Johnson"
+      },
+      {
+        "given": "Ellen",
+        "family": "Karasik"
+      },
+      {
+        "given": "Gene",
+        "family": "Kopen"
+      },
+      {
+        "given": "William F.",
+        "family": "Leinweber"
+      },
+      {
+        "given": "Alisa",
+        "family": "McDonald"
+      },
+      {
+        "given": "Michael T.",
+        "family": "Moser"
+      },
+      {
+        "given": "Kevin",
+        "family": "Myer"
+      },
+      {
+        "given": "Kimberley D.",
+        "family": "Ramsey"
+      },
+      {
+        "given": "Brian",
+        "family": "Roe"
+      },
+      {
+        "given": "Saboor",
+        "family": "Shad"
+      },
+      {
+        "given": "Jeffrey A.",
+        "family": "Thomas"
+      },
+      {
+        "given": "Gary",
+        "family": "Walters"
+      },
+      {
+        "given": "Michael",
+        "family": "Washington"
+      },
+      {
+        "given": "Joseph",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Scott D.",
+        "family": "Jewell"
+      },
+      {
+        "given": "Daniel C.",
+        "family": "Rohrer"
+      },
+      {
+        "given": "Dana R.",
+        "family": "Valley"
+      },
+      {
+        "given": "David A.",
+        "family": "Davis"
+      },
+      {
+        "given": "Deborah C.",
+        "family": "Mash"
+      },
+      {
+        "given": "Philip A.",
+        "family": "Branton"
+      },
+      {
+        "given": "Laura K.",
+        "family": "Barker"
+      },
+      {
+        "given": "Heather M.",
+        "family": "Gardiner"
+      },
+      {
+        "given": "Maghboeba",
+        "family": "Mosavel"
+      },
+      {
+        "given": "Laura A.",
+        "family": "Siminoff"
+      },
+      {
+        "given": "Paul",
+        "family": "Flicek"
+      },
+      {
+        "given": "Maximilian",
+        "family": "Haeussler"
+      },
+      {
+        "given": "Thomas",
+        "family": "Juettemann"
+      },
+      {
+        "given": "W. James",
+        "family": "Kent"
+      },
+      {
+        "given": "Christopher M.",
+        "family": "Lee"
+      },
+      {
+        "given": "Conner C.",
+        "family": "Powell"
+      },
+      {
+        "given": "Kate R.",
+        "family": "Rosenbloom"
+      },
+      {
+        "given": "Magali",
+        "family": "Ruffier"
+      },
+      {
+        "given": "Dan",
+        "family": "Sheppard"
+      },
+      {
+        "given": "Kieron",
+        "family": "Taylor"
+      },
+      {
+        "given": "Stephen J.",
+        "family": "Trevanion"
+      },
+      {
+        "given": "Daniel R.",
+        "family": "Zerbino"
+      },
+      {
+        "given": "Nathan S.",
+        "family": "Abell"
+      },
+      {
+        "given": "Joshua",
+        "family": "Akey"
+      },
+      {
+        "given": "Lin",
+        "family": "Chen"
+      },
+      {
+        "given": "Kathryn",
+        "family": "Demanelis"
+      },
+      {
+        "given": "Jennifer A.",
+        "family": "Doherty"
+      },
+      {
+        "given": "Andrew P.",
+        "family": "Feinberg"
+      },
+      {
+        "given": "Kasper D.",
+        "family": "Hansen"
+      },
+      {
+        "given": "Peter F.",
+        "family": "Hickey"
+      },
+      {
+        "given": "Farzana",
+        "family": "Jasmine"
+      },
+      {
+        "given": "Lihua",
+        "family": "Jiang"
+      },
+      {
+        "given": "Rajinder",
+        "family": "Kaul"
+      },
+      {
+        "given": "Muhammad G.",
+        "family": "Kibriya"
+      },
+      {
+        "given": "Jin Billy",
+        "family": "Li"
+      },
+      {
+        "given": "Qin",
+        "family": "Li"
+      },
+      {
+        "given": "Shin",
+        "family": "Lin"
+      },
+      {
+        "given": "Sandra E.",
+        "family": "Linder"
+      },
+      {
+        "given": "Brandon L.",
+        "family": "Pierce"
+      },
+      {
+        "given": "Lindsay F.",
+        "family": "Rizzardi"
+      },
+      {
+        "given": "Andrew D.",
+        "family": "Skol"
+      },
+      {
+        "given": "Kevin S.",
+        "family": "Smith"
+      },
+      {
+        "given": "Michael",
+        "family": "Snyder"
+      },
+      {
+        "given": "John",
+        "family": "Stamatoyannopoulos"
+      },
+      {
+        "given": "Hua",
+        "family": "Tang"
+      },
+      {
+        "given": "Meng",
+        "family": "Wang"
+      },
+      {
+        "given": "Latarsha J.",
+        "family": "Carithers"
+      },
+      {
+        "given": "Ping",
+        "family": "Guan"
+      },
+      {
+        "given": "Susan E.",
+        "family": "Koester"
+      },
+      {
+        "given": "A. Roger",
+        "family": "Little"
+      },
+      {
+        "given": "Helen M.",
+        "family": "Moore"
+      },
+      {
+        "given": "Concepcion R.",
+        "family": "Nierras"
+      },
+      {
+        "given": "Abhi K.",
+        "family": "Rao"
+      },
+      {
+        "given": "Jimmie B.",
+        "family": "Vaught"
+      },
+      {
+        "given": "Simona",
+        "family": "Volpi"
+      }
+    ],
+    "container-title": "Science",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghbnhr",
+    "container-title-short": "Science",
+    "PMCID": "PMC7737656",
+    "PMID": "32913098",
+    "id": "9Pr9idng",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/science.aaz1776"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "432",
+    "abstract": "<jats:p>Properly applied, clustering methods reveal meaning in high-throughput biological data.</jats:p>",
+    "DOI": "10.1126/scisignal.aad1932",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Avoiding common pitfalls when clustering biological data",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Tom",
+        "family": "Ronan"
+      },
+      {
+        "given": "Zhijie",
+        "family": "Qi"
+      },
+      {
+        "given": "Kristen M.",
+        "family": "Naegle"
+      }
+    ],
+    "container-title": "Science Signaling",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          6,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gcvjr6",
+    "container-title-short": "Sci. Signal.",
+    "PMID": "27303057",
+    "id": "14dCeRkua",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/scisignal.aad1932"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "96",
+    "abstract": "<jats:p>A systematic computational method predicts new uses for existing drugs by integrating public gene expression signatures of drugs and diseases.</jats:p>",
+    "DOI": "10.1126/scitranslmed.3001318",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Discovery and Preclinical Validation of Drug Indications Using Compendia of Public Gene Expression Data",
+    "volume": "3",
+    "author": [
+      {
+        "given": "Marina",
+        "family": "Sirota"
+      },
+      {
+        "given": "Joel T.",
+        "family": "Dudley"
+      },
+      {
+        "given": "Jeewon",
+        "family": "Kim"
+      },
+      {
+        "given": "Annie P.",
+        "family": "Chiang"
+      },
+      {
+        "given": "Alex A.",
+        "family": "Morgan"
+      },
+      {
+        "given": "Alejandro",
+        "family": "Sweet-Cordero"
+      },
+      {
+        "given": "Julien",
+        "family": "Sage"
+      },
+      {
+        "given": "Atul J.",
+        "family": "Butte"
+      }
+    ],
+    "container-title": "Science Translational Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          8,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/c3fwxv",
+    "container-title-short": "Sci. Transl. Med.",
+    "PMCID": "PMC3502016",
+    "PMID": "21849665",
+    "id": "mZjkE1xU",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/scitranslmed.3001318"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "96",
+    "abstract": "<jats:p>Computationally predicted repositioning of an anticonvulsant for inflammatory bowel disease is confirmed experimentally.</jats:p>",
+    "DOI": "10.1126/scitranslmed.3002648",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Computational Repositioning of the Anticonvulsant Topiramate for Inflammatory Bowel Disease",
+    "volume": "3",
+    "author": [
+      {
+        "given": "Joel T.",
+        "family": "Dudley"
+      },
+      {
+        "given": "Marina",
+        "family": "Sirota"
+      },
+      {
+        "given": "Mohan",
+        "family": "Shenoy"
+      },
+      {
+        "given": "Reetesh K.",
+        "family": "Pai"
+      },
+      {
+        "given": "Silke",
+        "family": "Roedder"
+      },
+      {
+        "given": "Annie P.",
+        "family": "Chiang"
+      },
+      {
+        "given": "Alex A.",
+        "family": "Morgan"
+      },
+      {
+        "given": "Minnie M.",
+        "family": "Sarwal"
+      },
+      {
+        "given": "Pankaj Jay",
+        "family": "Pasricha"
+      },
+      {
+        "given": "Atul J.",
+        "family": "Butte"
+      }
+    ],
+    "container-title": "Science Translational Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          8,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bmh5ts",
+    "container-title-short": "Sci. Transl. Med.",
+    "PMCID": "PMC3479650",
+    "PMID": "21849664",
+    "id": "1ClBKizD7",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/scitranslmed.3002648"
+  },
+  {
+    "publisher": "S. Karger AG",
+    "issue": "6",
+    "abstract": "<jats:p>Background/Aims: Over 99% of mouse and human ovarian follicles will undergo specialized cell death including atresia and apoptosis. Reduction of apoptosis may help reduce infertility and maintain the reproductive ability in women. Methods: 3-day B6D2F1 mice were used to culture small follicle and ovary tissue with niacin and 18-day mice were intraperitoneal injected with niacin to determine its effect on follicle development. Then establish 8-weeks POF animal model with cytoxan (CTX) or radiation. Treatment group was given 0.1 mL of 100 mM niacin by an intraperitoneal injection twice before ovulation. The ovaries were collected and the follicles were counted and categorized, and ovarian histologic sections were stained for TUNEL. Ovarian function was then evaluated by monitoring ovulation. Microarray analyses, Western blot, immunofluorescence and real-time quantitative PCR were used to assess the mechanism of ovarian injury and repair. Results: We found that niacin promotes follicle growth in the immature oocyte and it increased the levels of a germ-line cell marker DDX4, and a cell proliferation marker PCNA in the ovary. Addition of niacin to the cell culture reduced oocyte apoptosis in vitro. Administration of niacin to treat premature ovarian failure (POF) in mouse models showed inhibition of follicular apoptosis under harmful conditions, such as radiation and chemotherapy damage, by markedly reducing cumulus cell apoptosis. Additionally, the number of developing follicles increased after administration of niacin. Conclusion: Niacin may have an important function in treating POF by reducing apoptosis in clinical applications.</jats:p>",
+    "DOI": "10.1159/000495051",
+    "type": "article-journal",
+    "page": "2060-2070",
+    "source": "Crossref",
+    "title": "Niacin Inhibits Apoptosis and Rescues Premature Ovarian Failure",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Shufang",
+        "family": "Wang"
+      },
+      {
+        "given": "Min",
+        "family": "Sun"
+      },
+      {
+        "given": "Ling",
+        "family": "Yu"
+      },
+      {
+        "given": "Yixuan",
+        "family": "Wang"
+      },
+      {
+        "given": "Yuanqing",
+        "family": "Yao"
+      },
+      {
+        "given": "Deqing",
+        "family": "Wang"
+      }
+    ],
+    "container-title": "Cellular Physiology and Biochemistry",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfqvcq",
+    "container-title-short": "Cell Physiol Biochem",
+    "PMID": "30415247",
+    "id": "kLRErKXz",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1159/000495051"
+  },
+  {
+    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
+    "issue": "2",
+    "abstract": "<jats:p>Atherosclerosis has been characterized as a chronic inflammatory response to cholesterol deposition in arteries, but the mechanisms linking cholesterol accumulation in macrophage foam cells to inflammation are poorly understood. Macrophage cholesterol efflux occurs at all stages of atherosclerosis and protects cells from free cholesterol and oxysterol-induced toxicity. The ATP-binding cassette transporters ABCA1 and ABCG1 are responsible for the major part of macrophage cholesterol efflux to serum or HDL in macrophage foam cells, but other less efficient pathways such as passive efflux are also involved. Recent studies have shown that the sterol efflux activities of ABCA1 and ABCG1 modulate macrophage expression of inflammatory cytokines and chemokines as well as lymphocyte proliferative responses. In macrophages, transporter deficiency causes increased signaling via various Toll-like receptors including TLR4. These studies have shown that the traditional roles of HDL and ABC transporters in cholesterol efflux and reverse cholesterol transport are mechanistically linked to antiinflammatory and immunosuppressive functions of HDL. The underlying mechanisms may involve modulation of sterol levels and lipid organization in cell membranes.</jats:p>",
+    "DOI": "10.1161/atvbaha.108.179283",
+    "type": "article-journal",
+    "page": "139-143",
+    "source": "Crossref",
+    "title": "Role of HDL, ABCA1, and ABCG1 Transporters in Cholesterol Efflux and Immune Responses",
+    "volume": "30",
+    "author": [
+      {
+        "given": "Laurent",
+        "family": "Yvan-Charvet"
+      },
+      {
+        "given": "Nan",
+        "family": "Wang"
+      },
+      {
+        "given": "Alan R.",
+        "family": "Tall"
+      }
+    ],
+    "container-title": "Arteriosclerosis, Thrombosis, and Vascular Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2010,
+          2
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ds23w6",
+    "container-title-short": "ATVB",
+    "PMCID": "PMC2812788",
+    "PMID": "19797709",
+    "id": "1DblG8swn",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1161/atvbaha.108.179283"
+  },
+  {
+    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
+    "issue": "1",
+    "abstract": "<jats:p>In a somewhat narrow diagnostic lens, Alzheimer disease (AD) has been considered a brain-specific disease characterized by the presence of Aβ (β-amyloid) plaques and tau neural fibrillary tangles and neural inflammation; these pathologies lead to neuronal death and consequently clinical symptoms, such as memory loss, confusion, and impaired cognitive function. However, for decades, researchers have noticed a link between various cardiovascular abnormalities and AD—such as heart failure, coronary artery disease, atrial fibrillation, and vasculopathy. A considerable volume of work has pointed at this head to heart connection, focusing mainly on associations between cerebral hypoperfusion and neuronal degradation. However, new evidence of a possible systemic or metastatic profile to AD calls for further analysis of this connection. Aβ aggregations—biochemically and structurally akin to those found in the typical AD pathology—are now known to be present in the hearts of individuals with idiopathic dilated cardiomyopathy, as well as the hearts of patients with AD. These findings suggest a potential systemic profile of proteinopathies and a new hypothesis for the link between peripheral and central symptoms of heart failure and AD. Herein, we provide an overview of the cardiovascular links to Alzheimer disease.</jats:p>",
+    "DOI": "10.1161/circresaha.118.313563",
+    "type": "article-journal",
+    "page": "142-149",
+    "source": "Crossref",
+    "title": "Getting to the Heart of Alzheimer Disease",
+    "volume": "124",
+    "author": [
+      {
+        "given": "Joshua M.",
+        "family": "Tublin"
+      },
+      {
+        "given": "Jeremy M.",
+        "family": "Adelstein"
+      },
+      {
+        "given": "Federica",
+        "family": "del Monte"
+      },
+      {
+        "given": "Colin K.",
+        "family": "Combs"
+      },
+      {
+        "given": "Loren E.",
+        "family": "Wold"
+      }
+    ],
+    "container-title": "Circulation Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          1,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjzjgq",
+    "container-title-short": "Circ Res",
+    "PMCID": "PMC6319653",
+    "PMID": "30605407",
+    "id": "13t4TuFeJ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1161/circresaha.118.313563"
+  },
+  {
+    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
+    "issue": "2",
+    "abstract": "<jats:sec>\n            <jats:title>Background—</jats:title>\n            <jats:p>Depression, anxiety, and psychotic disorders have been associated with an increased risk of coronary heart disease (CHD). It is unclear whether this association between mental health and CHD is present across a wider range of mental disorders.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Methods and Results—</jats:title>\n            <jats:p>Participants were 1 107 524 Swedish men conscripted at a mean age of 18.3 years. Mental disorders were assessed by psychiatric interview on conscription, and data on hospital admissions for mental disorder and CHD were obtained from national registers during 22.6 years of follow-up. An increased risk of incident CHD was evident across a range of mental disorders whether diagnosed at conscription or on later hospital admission. Age-adjusted hazard ratios (95% confidence intervals) according to diagnoses at conscription ranged from 1.30 (1.05–1.62) (depressive disorders) to 1.90 (1.58–2.28) (alcohol-related disorders). The equivalent figures according to diagnoses during hospital admission ranged from 1.49 (1.24–1.80) (schizophrenia) to 2.82 (2.53–3.13) (other substance use disorders). Associations were little changed by adjustment for parental socioeconomic status, or body mass index, diabetes mellitus, and blood pressure measured at conscription, but they were partially attenuated by the adjustment for smoking, alcohol intake, and intelligence measured at conscription, and for education and own socioeconomic position.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Conclusions—</jats:title>\n            <jats:p>Increased risk of incident CHD is present across a range of mental disorders and is observable when the disorders are diagnosed at a young age.</jats:p>\n          </jats:sec>",
+    "DOI": "10.1161/circulationaha.113.002065",
+    "type": "article-journal",
+    "page": "186-193",
+    "source": "Crossref",
+    "title": "Mental Disorders Across the Adult Life Course and Future Coronary Heart Disease",
+    "volume": "129",
+    "author": [
+      {
+        "given": "Catharine R.",
+        "family": "Gale"
+      },
+      {
+        "given": "G. David",
+        "family": "Batty"
+      },
+      {
+        "given": "David P. J.",
+        "family": "Osborn"
+      },
+      {
+        "given": "Per",
+        "family": "Tynelius"
+      },
+      {
+        "given": "Finn",
+        "family": "Rasmussen"
+      }
+    ],
+    "container-title": "Circulation",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          1,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/qm4",
+    "container-title-short": "Circulation",
+    "PMCID": "PMC4107269",
+    "PMID": "24190959",
+    "id": "j2Sl4DAE",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1161/circulationaha.113.002065"
+  },
+  {
+    "publisher": "American Society for Clinical Investigation",
+    "issue": "3",
+    "DOI": "10.1172/jci41651",
+    "type": "article-journal",
+    "page": "1163-1173",
+    "source": "Crossref",
+    "title": "Nicotinic acid inhibits progression of atherosclerosis in mice through its receptor GPR109A expressed by immune cells",
+    "volume": "121",
+    "author": [
+      {
+        "given": "Martina",
+        "family": "Lukasova"
+      },
+      {
+        "given": "Camille",
+        "family": "Malaval"
+      },
+      {
+        "given": "Andreas",
+        "family": "Gille"
+      },
+      {
+        "given": "Jukka",
+        "family": "Kero"
+      },
+      {
+        "given": "Stefan",
+        "family": "Offermanns"
+      }
+    ],
+    "container-title": "Journal of Clinical Investigation",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          3,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cqftcq",
+    "container-title-short": "J. Clin. Invest.",
+    "PMCID": "PMC3048854",
+    "PMID": "21317532",
+    "id": "1Bz0jRHYo",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1172/jci41651"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title>\n          <jats:sec>\n            <jats:title>Background</jats:title>\n            <jats:p>Accurate evaluation of the quality of genomic or proteomic data and computational methods is vital to our ability to use them for formulating novel biological hypotheses and directing further experiments. There is currently no standard approach to evaluation in functional genomics. Our analysis of existing approaches shows that they are inconsistent and contain substantial functional biases that render the resulting evaluations misleading both quantitatively and qualitatively. These problems make it essentially impossible to compare computational methods or large-scale experimental datasets and also result in conclusions that generalize poorly in most biological applications.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Results</jats:title>\n            <jats:p>We reveal issues with current evaluation methods here and suggest new approaches to evaluation that facilitate accurate and representative characterization of genomic methods and data. Specifically, we describe a functional genomics gold standard based on curation by expert biologists and demonstrate its use as an effective means of evaluation of genomic approaches. Our evaluation framework and gold standard are freely available to the community through our website.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Conclusion</jats:title>\n            <jats:p>Proper methods for evaluating genomic data and computational approaches will determine how much we, as a community, are able to learn from the wealth of available data. We propose one possible solution to this problem here but emphasize that this topic warrants broader community discussion.</jats:p>\n          </jats:sec>",
+    "DOI": "10.1186/1471-2164-7-187",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Finding function: evaluation methods for functional genomic data",
+    "volume": "7",
+    "author": [
+      {
+        "given": "Chad L",
+        "family": "Myers"
+      },
+      {
+        "given": "Daniel R",
+        "family": "Barrett"
+      },
+      {
+        "given": "Matthew A",
+        "family": "Hibbs"
+      },
+      {
+        "given": "Curtis",
+        "family": "Huttenhower"
+      },
+      {
+        "given": "Olga G",
+        "family": "Troyanskaya"
+      }
+    ],
+    "container-title": "BMC Genomics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2006,
+          7,
+          25
+        ]
+      ]
+    },
+    "URL": "https://doi.org/fg6wnk",
+    "container-title-short": "BMC Genomics",
+    "PMCID": "PMC1560386",
+    "PMID": "16869964",
+    "id": "1FVd2WW6G",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/1471-2164-7-187"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1186/s12916-014-0206-2",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "The overlap between vascular disease and Alzheimer’s disease - lessons from pathology",
+    "volume": "12",
+    "author": [
+      {
+        "given": "Johannes",
+        "family": "Attems"
+      },
+      {
+        "given": "Kurt A",
+        "family": "Jellinger"
+      }
+    ],
+    "container-title": "BMC Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          11,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f6pjd4",
+    "container-title-short": "BMC Med",
+    "PMCID": "PMC4226890",
+    "PMID": "25385447",
+    "id": "D83Aqhga",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s12916-014-0206-2"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:sec>\n                <jats:title>Background</jats:title>\n                <jats:p>Mapping disease-associated genetic variants to complex disease pathophysiology is a major challenge in translating findings from genome-wide association studies into novel therapeutic opportunities. The difficulty lies in our limited understanding of how phenotypic traits arise from non-coding genetic variants in highly organized biological systems with heterogeneous gene expression across cells and tissues.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Results</jats:title>\n                <jats:p>We present a novel strategy, called GWAS component analysis, for transferring disease associations from single-nucleotide polymorphisms to co-expression modules by stacking models trained using reference genome and tissue-specific gene expression data. Application of this method to genome-wide association studies of blood cell counts confirmed that it could detect gene sets enriched in expected cell types. In addition, coupling of our method with Bayesian networks enables GWAS components to be used to discover drug targets.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Conclusions</jats:title>\n                <jats:p>We tested genome-wide associations of four disease phenotypes, including age-related macular degeneration, Crohn’s disease, ulcerative colitis and rheumatoid arthritis, and demonstrated the proposed method could select more functional genes than S-PrediXcan, the previous single-step model for predicting gene-level associations from SNP-level associations.</jats:p>\n              </jats:sec>",
+    "DOI": "10.1186/s13040-020-00216-9",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Identification of therapeutic targets from genetic association studies using hierarchical component analysis",
+    "volume": "13",
+    "author": [
+      {
+        "given": "Hao-Chih",
+        "family": "Lee"
+      },
+      {
+        "given": "Osamu",
+        "family": "Ichikawa"
+      },
+      {
+        "given": "Benjamin S.",
+        "family": "Glicksberg"
+      },
+      {
+        "given": "Aparna A.",
+        "family": "Divaraniya"
+      },
+      {
+        "given": "Christine E.",
+        "family": "Becker"
+      },
+      {
+        "given": "Pankaj",
+        "family": "Agarwal"
+      },
+      {
+        "given": "Joel T.",
+        "family": "Dudley"
+      }
+    ],
+    "container-title": "BioData Mining",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          6,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjp5pf",
+    "container-title-short": "BioData Mining",
+    "PMCID": "PMC7301559",
+    "PMID": "32565911",
+    "id": "57TjOMEA",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13040-020-00216-9"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1186/s13059-016-1070-5",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Estimating the population abundance of tissue-infiltrating immune and stromal cell populations using gene expression",
+    "volume": "17",
+    "author": [
+      {
+        "given": "Etienne",
+        "family": "Becht"
+      },
+      {
+        "given": "Nicolas A.",
+        "family": "Giraldo"
+      },
+      {
+        "given": "Laetitia",
+        "family": "Lacroix"
+      },
+      {
+        "given": "Bénédicte",
+        "family": "Buttard"
+      },
+      {
+        "given": "Nabila",
+        "family": "Elarouci"
+      },
+      {
+        "given": "Florent",
+        "family": "Petitprez"
+      },
+      {
+        "given": "Janick",
+        "family": "Selves"
+      },
+      {
+        "given": "Pierre",
+        "family": "Laurent-Puig"
+      },
+      {
+        "given": "Catherine",
+        "family": "Sautès-Fridman"
+      },
+      {
+        "given": "Wolf H.",
+        "family": "Fridman"
+      },
+      {
+        "given": "Aurélien",
+        "family": "de Reyniès"
+      }
+    ],
+    "container-title": "Genome Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          10,
+          20
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f87sgf",
+    "container-title-short": "Genome Biol",
+    "PMCID": "PMC5073889",
+    "PMID": "27765066",
+    "id": "18TSqd1tG",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13059-016-1070-5"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:sec>\n                <jats:title>Background</jats:title>\n                <jats:p>The Critical Assessment of Functional Annotation (CAFA) is an ongoing, global, community-driven effort to evaluate and improve the computational annotation of protein function.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Results</jats:title>\n                <jats:p>Here, we report on the results of the third CAFA challenge, CAFA3, that featured an expanded analysis over the previous CAFA rounds, both in terms of volume of data analyzed and the types of analysis performed. In a novel and major new development, computational predictions and assessment goals drove some of the experimental assays, resulting in new functional annotations for more than 1000 genes. Specifically, we performed experimental whole-genome mutation screening in <jats:italic>Candida albicans</jats:italic> and <jats:italic>Pseudomonas aureginosa</jats:italic> genomes, which provided us with genome-wide experimental data for genes associated with biofilm formation and motility. We further performed targeted assays on selected genes in <jats:italic>Drosophila melanogaster</jats:italic>, which we suspected of being involved in long-term memory.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Conclusion</jats:title>\n                <jats:p>We conclude that while predictions of the molecular function and biological process annotations have slightly improved over time, those of the cellular component have not. Term-centric prediction of experimental annotations remains equally challenging; although the performance of the top methods is significantly better than the expectations set by baseline methods in <jats:italic>C. albicans</jats:italic> and <jats:italic>D. melanogaster</jats:italic>, it leaves considerable room and need for improvement. 
Finally, we report that the CAFA community now involves a broad range of participants with expertise in bioinformatics, biological experimentation, biocuration, and bio-ontologies, working together to improve functional annotation, computational function prediction, and our ability to manage big data in the era of large experimental screens.</jats:p>\n              </jats:sec>",
+    "DOI": "10.1186/s13059-019-1835-8",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "The CAFA challenge reports improved protein function prediction and new functional annotations for hundreds of genes through experimental screens",
+    "volume": "20",
+    "author": [
+      {
+        "given": "Naihui",
+        "family": "Zhou"
+      },
+      {
+        "given": "Yuxiang",
+        "family": "Jiang"
+      },
+      {
+        "given": "Timothy R.",
+        "family": "Bergquist"
+      },
+      {
+        "given": "Alexandra J.",
+        "family": "Lee"
+      },
+      {
+        "given": "Balint Z.",
+        "family": "Kacsoh"
+      },
+      {
+        "given": "Alex W.",
+        "family": "Crocker"
+      },
+      {
+        "given": "Kimberley A.",
+        "family": "Lewis"
+      },
+      {
+        "given": "George",
+        "family": "Georghiou"
+      },
+      {
+        "given": "Huy N.",
+        "family": "Nguyen"
+      },
+      {
+        "given": "Md Nafiz",
+        "family": "Hamid"
+      },
+      {
+        "given": "Larry",
+        "family": "Davis"
+      },
+      {
+        "given": "Tunca",
+        "family": "Dogan"
+      },
+      {
+        "given": "Volkan",
+        "family": "Atalay"
+      },
+      {
+        "given": "Ahmet S.",
+        "family": "Rifaioglu"
+      },
+      {
+        "given": "Alperen",
+        "family": "Dalkıran"
+      },
+      {
+        "given": "Rengul",
+        "family": "Cetin Atalay"
+      },
+      {
+        "given": "Chengxin",
+        "family": "Zhang"
+      },
+      {
+        "given": "Rebecca L.",
+        "family": "Hurto"
+      },
+      {
+        "given": "Peter L.",
+        "family": "Freddolino"
+      },
+      {
+        "given": "Yang",
+        "family": "Zhang"
+      },
+      {
+        "given": "Prajwal",
+        "family": "Bhat"
+      },
+      {
+        "given": "Fran",
+        "family": "Supek"
+      },
+      {
+        "given": "José M.",
+        "family": "Fernández"
+      },
+      {
+        "given": "Branislava",
+        "family": "Gemovic"
+      },
+      {
+        "given": "Vladimir R.",
+        "family": "Perovic"
+      },
+      {
+        "given": "Radoslav S.",
+        "family": "Davidović"
+      },
+      {
+        "given": "Neven",
+        "family": "Sumonja"
+      },
+      {
+        "given": "Nevena",
+        "family": "Veljkovic"
+      },
+      {
+        "given": "Ehsaneddin",
+        "family": "Asgari"
+      },
+      {
+        "given": "Mohammad R.K.",
+        "family": "Mofrad"
+      },
+      {
+        "given": "Giuseppe",
+        "family": "Profiti"
+      },
+      {
+        "given": "Castrense",
+        "family": "Savojardo"
+      },
+      {
+        "given": "Pier Luigi",
+        "family": "Martelli"
+      },
+      {
+        "given": "Rita",
+        "family": "Casadio"
+      },
+      {
+        "given": "Florian",
+        "family": "Boecker"
+      },
+      {
+        "given": "Heiko",
+        "family": "Schoof"
+      },
+      {
+        "given": "Indika",
+        "family": "Kahanda"
+      },
+      {
+        "given": "Natalie",
+        "family": "Thurlby"
+      },
+      {
+        "given": "Alice C.",
+        "family": "McHardy"
+      },
+      {
+        "given": "Alexandre",
+        "family": "Renaux"
+      },
+      {
+        "given": "Rabie",
+        "family": "Saidi"
+      },
+      {
+        "given": "Julian",
+        "family": "Gough"
+      },
+      {
+        "given": "Alex A.",
+        "family": "Freitas"
+      },
+      {
+        "given": "Magdalena",
+        "family": "Antczak"
+      },
+      {
+        "given": "Fabio",
+        "family": "Fabris"
+      },
+      {
+        "given": "Mark N.",
+        "family": "Wass"
+      },
+      {
+        "given": "Jie",
+        "family": "Hou"
+      },
+      {
+        "given": "Jianlin",
+        "family": "Cheng"
+      },
+      {
+        "given": "Zheng",
+        "family": "Wang"
+      },
+      {
+        "given": "Alfonso E.",
+        "family": "Romero"
+      },
+      {
+        "given": "Alberto",
+        "family": "Paccanaro"
+      },
+      {
+        "given": "Haixuan",
+        "family": "Yang"
+      },
+      {
+        "given": "Tatyana",
+        "family": "Goldberg"
+      },
+      {
+        "given": "Chenguang",
+        "family": "Zhao"
+      },
+      {
+        "given": "Liisa",
+        "family": "Holm"
+      },
+      {
+        "given": "Petri",
+        "family": "Törönen"
+      },
+      {
+        "given": "Alan J.",
+        "family": "Medlar"
+      },
+      {
+        "given": "Elaine",
+        "family": "Zosa"
+      },
+      {
+        "given": "Itamar",
+        "family": "Borukhov"
+      },
+      {
+        "given": "Ilya",
+        "family": "Novikov"
+      },
+      {
+        "given": "Angela",
+        "family": "Wilkins"
+      },
+      {
+        "given": "Olivier",
+        "family": "Lichtarge"
+      },
+      {
+        "given": "Po-Han",
+        "family": "Chi"
+      },
+      {
+        "given": "Wei-Cheng",
+        "family": "Tseng"
+      },
+      {
+        "given": "Michal",
+        "family": "Linial"
+      },
+      {
+        "given": "Peter W.",
+        "family": "Rose"
+      },
+      {
+        "given": "Christophe",
+        "family": "Dessimoz"
+      },
+      {
+        "given": "Vedrana",
+        "family": "Vidulin"
+      },
+      {
+        "given": "Saso",
+        "family": "Dzeroski"
+      },
+      {
+        "given": "Ian",
+        "family": "Sillitoe"
+      },
+      {
+        "given": "Sayoni",
+        "family": "Das"
+      },
+      {
+        "given": "Jonathan Gill",
+        "family": "Lees"
+      },
+      {
+        "given": "David T.",
+        "family": "Jones"
+      },
+      {
+        "given": "Cen",
+        "family": "Wan"
+      },
+      {
+        "given": "Domenico",
+        "family": "Cozzetto"
+      },
+      {
+        "given": "Rui",
+        "family": "Fa"
+      },
+      {
+        "given": "Mateo",
+        "family": "Torres"
+      },
+      {
+        "given": "Alex",
+        "family": "Warwick Vesztrocy"
+      },
+      {
+        "given": "Jose Manuel",
+        "family": "Rodriguez"
+      },
+      {
+        "given": "Michael L.",
+        "family": "Tress"
+      },
+      {
+        "given": "Marco",
+        "family": "Frasca"
+      },
+      {
+        "given": "Marco",
+        "family": "Notaro"
+      },
+      {
+        "given": "Giuliano",
+        "family": "Grossi"
+      },
+      {
+        "given": "Alessandro",
+        "family": "Petrini"
+      },
+      {
+        "given": "Matteo",
+        "family": "Re"
+      },
+      {
+        "given": "Giorgio",
+        "family": "Valentini"
+      },
+      {
+        "given": "Marco",
+        "family": "Mesiti"
+      },
+      {
+        "given": "Daniel B.",
+        "family": "Roche"
+      },
+      {
+        "given": "Jonas",
+        "family": "Reeb"
+      },
+      {
+        "given": "David W.",
+        "family": "Ritchie"
+      },
+      {
+        "given": "Sabeur",
+        "family": "Aridhi"
+      },
+      {
+        "given": "Seyed Ziaeddin",
+        "family": "Alborzi"
+      },
+      {
+        "given": "Marie-Dominique",
+        "family": "Devignes"
+      },
+      {
+        "given": "Da Chen Emily",
+        "family": "Koo"
+      },
+      {
+        "given": "Richard",
+        "family": "Bonneau"
+      },
+      {
+        "given": "Vladimir",
+        "family": "Gligorijević"
+      },
+      {
+        "given": "Meet",
+        "family": "Barot"
+      },
+      {
+        "given": "Hai",
+        "family": "Fang"
+      },
+      {
+        "given": "Stefano",
+        "family": "Toppo"
+      },
+      {
+        "given": "Enrico",
+        "family": "Lavezzo"
+      },
+      {
+        "given": "Marco",
+        "family": "Falda"
+      },
+      {
+        "given": "Michele",
+        "family": "Berselli"
+      },
+      {
+        "given": "Silvio C.E.",
+        "family": "Tosatto"
+      },
+      {
+        "given": "Marco",
+        "family": "Carraro"
+      },
+      {
+        "given": "Damiano",
+        "family": "Piovesan"
+      },
+      {
+        "given": "Hafeez",
+        "family": "Ur Rehman"
+      },
+      {
+        "given": "Qizhong",
+        "family": "Mao"
+      },
+      {
+        "given": "Shanshan",
+        "family": "Zhang"
+      },
+      {
+        "given": "Slobodan",
+        "family": "Vucetic"
+      },
+      {
+        "given": "Gage S.",
+        "family": "Black"
+      },
+      {
+        "given": "Dane",
+        "family": "Jo"
+      },
+      {
+        "given": "Erica",
+        "family": "Suh"
+      },
+      {
+        "given": "Jonathan B.",
+        "family": "Dayton"
+      },
+      {
+        "given": "Dallas J.",
+        "family": "Larsen"
+      },
+      {
+        "given": "Ashton R.",
+        "family": "Omdahl"
+      },
+      {
+        "given": "Liam J.",
+        "family": "McGuffin"
+      },
+      {
+        "given": "Danielle A.",
+        "family": "Brackenridge"
+      },
+      {
+        "given": "Patricia C.",
+        "family": "Babbitt"
+      },
+      {
+        "given": "Jeffrey M.",
+        "family": "Yunes"
+      },
+      {
+        "given": "Paolo",
+        "family": "Fontana"
+      },
+      {
+        "given": "Feng",
+        "family": "Zhang"
+      },
+      {
+        "given": "Shanfeng",
+        "family": "Zhu"
+      },
+      {
+        "given": "Ronghui",
+        "family": "You"
+      },
+      {
+        "given": "Zihan",
+        "family": "Zhang"
+      },
+      {
+        "given": "Suyang",
+        "family": "Dai"
+      },
+      {
+        "given": "Shuwei",
+        "family": "Yao"
+      },
+      {
+        "given": "Weidong",
+        "family": "Tian"
+      },
+      {
+        "given": "Renzhi",
+        "family": "Cao"
+      },
+      {
+        "given": "Caleb",
+        "family": "Chandler"
+      },
+      {
+        "given": "Miguel",
+        "family": "Amezola"
+      },
+      {
+        "given": "Devon",
+        "family": "Johnson"
+      },
+      {
+        "given": "Jia-Ming",
+        "family": "Chang"
+      },
+      {
+        "given": "Wen-Hung",
+        "family": "Liao"
+      },
+      {
+        "given": "Yi-Wei",
+        "family": "Liu"
+      },
+      {
+        "given": "Stefano",
+        "family": "Pascarelli"
+      },
+      {
+        "given": "Yotam",
+        "family": "Frank"
+      },
+      {
+        "given": "Robert",
+        "family": "Hoehndorf"
+      },
+      {
+        "given": "Maxat",
+        "family": "Kulmanov"
+      },
+      {
+        "given": "Imane",
+        "family": "Boudellioua"
+      },
+      {
+        "given": "Gianfranco",
+        "family": "Politano"
+      },
+      {
+        "given": "Stefano",
+        "family": "Di Carlo"
+      },
+      {
+        "given": "Alfredo",
+        "family": "Benso"
+      },
+      {
+        "given": "Kai",
+        "family": "Hakala"
+      },
+      {
+        "given": "Filip",
+        "family": "Ginter"
+      },
+      {
+        "given": "Farrokh",
+        "family": "Mehryary"
+      },
+      {
+        "given": "Suwisa",
+        "family": "Kaewphan"
+      },
+      {
+        "given": "Jari",
+        "family": "Björne"
+      },
+      {
+        "given": "Hans",
+        "family": "Moen"
+      },
+      {
+        "given": "Martti E.E.",
+        "family": "Tolvanen"
+      },
+      {
+        "given": "Tapio",
+        "family": "Salakoski"
+      },
+      {
+        "given": "Daisuke",
+        "family": "Kihara"
+      },
+      {
+        "given": "Aashish",
+        "family": "Jain"
+      },
+      {
+        "given": "Tomislav",
+        "family": "Šmuc"
+      },
+      {
+        "given": "Adrian",
+        "family": "Altenhoff"
+      },
+      {
+        "given": "Asa",
+        "family": "Ben-Hur"
+      },
+      {
+        "given": "Burkhard",
+        "family": "Rost"
+      },
+      {
+        "given": "Steven E.",
+        "family": "Brenner"
+      },
+      {
+        "given": "Christine A.",
+        "family": "Orengo"
+      },
+      {
+        "given": "Constance J.",
+        "family": "Jeffery"
+      },
+      {
+        "given": "Giovanni",
+        "family": "Bosco"
+      },
+      {
+        "given": "Deborah A.",
+        "family": "Hogan"
+      },
+      {
+        "given": "Maria J.",
+        "family": "Martin"
+      },
+      {
+        "given": "Claire",
+        "family": "O’Donovan"
+      },
+      {
+        "given": "Sean D.",
+        "family": "Mooney"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      },
+      {
+        "given": "Predrag",
+        "family": "Radivojac"
+      },
+      {
+        "given": "Iddo",
+        "family": "Friedberg"
+      }
+    ],
+    "container-title": "Genome Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          11,
+          19
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggnxpz",
+    "container-title-short": "Genome Biol",
+    "PMCID": "PMC6864930",
+    "PMID": "31744546",
+    "id": "DN7TyZzb",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13059-019-1835-8"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:sec>\n                <jats:title>Background</jats:title>\n                <jats:p>Polygenic risk scores (PRS) are valuable to translate the results of genome-wide association studies (GWAS) into clinical practice. To date, most GWAS have been based on individuals of European-ancestry leading to poor performance in populations of non-European ancestry.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Results</jats:title>\n                <jats:p>We introduce the polygenic transcriptome risk score (PTRS), which is based on predicted transcript levels (rather than SNPs), and explore the portability of PTRS across populations using UK Biobank data.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Conclusions</jats:title>\n                <jats:p>We show that PTRS has a significantly higher portability (Wilcoxon <jats:italic>p</jats:italic>=0.013) in the African-descent samples where the loss of performance is most acute with better performance than PRS when used in combination.</jats:p>\n              </jats:sec>",
+    "DOI": "10.1186/s13059-021-02591-w",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Polygenic transcriptome risk scores (PTRS) can improve portability of polygenic risk scores across ancestries",
+    "volume": "23",
+    "author": [
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Ani",
+        "family": "Manichaikul"
+      },
+      {
+        "given": "Abraham A.",
+        "family": "Palmer"
+      },
+      {
+        "given": "Nancy J.",
+        "family": "Cox"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Genome Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2022,
+          1,
+          13
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqtdvn",
+    "container-title-short": "Genome Biol",
+    "PMCID": "PMC8759285",
+    "PMID": "35027082",
+    "id": "hSYqogYZ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13059-021-02591-w"
+  },
+  {
+    "publisher": "Royal College of Psychiatrists",
+    "issue": "3",
+    "abstract": "<jats:sec><jats:title>Background</jats:title><jats:p>Bipolar disorder and schizophrenia are associated with increased mortality relative to the general population. There is an international emphasis on decreasing this excess mortality.</jats:p></jats:sec><jats:sec><jats:title>Aims</jats:title><jats:p>To determine whether the mortality gap between individuals with bipolar disorder and schizophrenia and the general population has decreased.</jats:p></jats:sec><jats:sec><jats:title>Method</jats:title><jats:p>A nationally representative cohort study using primary care electronic health records from 2000 to 2014, comparing all patients diagnosed with bipolar disorder or schizophrenia and the general population. The primary outcome was all-cause mortality.</jats:p></jats:sec><jats:sec><jats:title>Results</jats:title><jats:p>Individuals with bipolar disorder and schizophrenia had elevated mortality (adjusted hazard ratio (HR) = 1.79, 95% CI 1.67–1.88 and 2.08, 95% CI 1.98–2.19 respectively). Adjusted HRs for bipolar disorder increased by 0.14/year (95% CI 0.10–0.19) from 2006 to 2014. The adjusted HRs for schizophrenia increased gradually from 2004 to 2010 (0.11/year, 95% CI 0.04–0.17) and rapidly after 2010 (0.34/year, 95% CI 0.18–0.49).</jats:p></jats:sec><jats:sec><jats:title>Conclusions</jats:title><jats:p>The mortality gap between individuals with bipolar disorder and schizophrenia, and the general population is widening.</jats:p></jats:sec>",
+    "DOI": "10.1192/bjp.bp.117.202606",
+    "type": "article-journal",
+    "page": "175-181",
+    "source": "Crossref",
+    "title": "Mortality gap for people with bipolar disorder and schizophrenia: UK-based cohort study 2000–2014",
+    "volume": "211",
+    "author": [
+      {
+        "given": "Joseph F.",
+        "family": "Hayes"
+      },
+      {
+        "given": "Louise",
+        "family": "Marston"
+      },
+      {
+        "given": "Kate",
+        "family": "Walters"
+      },
+      {
+        "given": "Michael B.",
+        "family": "King"
+      },
+      {
+        "given": "David P. J.",
+        "family": "Osborn"
+      }
+    ],
+    "container-title": "British Journal of Psychiatry",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          9
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gbwcjx",
+    "container-title-short": "Br J Psychiatry",
+    "PMCID": "PMC5579328",
+    "PMID": "28684403",
+    "id": "17LYMnG9n",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1192/bjp.bp.117.202606"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "4",
+    "DOI": "10.1194/jlr.s092007",
+    "type": "article-journal",
+    "page": "741-746",
+    "source": "Crossref",
+    "title": "Niacin: an old lipid drug in a new NAD+ dress",
+    "volume": "60",
+    "author": [
+      {
+        "given": "Mario",
+        "family": "Romani"
+      },
+      {
+        "given": "Dina Carina",
+        "family": "Hofer"
+      },
+      {
+        "given": "Elena",
+        "family": "Katsyuba"
+      },
+      {
+        "given": "Johan",
+        "family": "Auwerx"
+      }
+    ],
+    "container-title": "Journal of Lipid Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjpjft",
+    "container-title-short": "Journal of Lipid Research",
+    "PMCID": "PMC6446705",
+    "PMID": "30782960",
+    "id": "7OgaYjeL",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1194/jlr.s092007"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "4",
+    "DOI": "10.1371/journal.pcbi.1004219",
+    "type": "article-journal",
+    "page": "e1004219",
+    "source": "Crossref",
+    "title": "MAGMA: Generalized Gene-Set Analysis of GWAS Data",
+    "volume": "11",
+    "author": [
+      {
+        "given": "Christiaan A.",
+        "family": "de Leeuw"
+      },
+      {
+        "given": "Joris M.",
+        "family": "Mooij"
+      },
+      {
+        "given": "Tom",
+        "family": "Heskes"
+      },
+      {
+        "given": "Danielle",
+        "family": "Posthuma"
+      }
+    ],
+    "container-title": "PLOS Computational Biology",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Hua",
+        "family": "Tang"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          4,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf92gp",
+    "container-title-short": "PLoS Comput Biol",
+    "PMCID": "PMC4401657",
+    "PMID": "25885710",
+    "id": "19XiXgYmd",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pcbi.1004219"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "1",
+    "DOI": "10.1371/journal.pgen.1007889",
+    "type": "article-journal",
+    "page": "e1007889",
+    "source": "Crossref",
+    "title": "Integrating predicted transcriptome from multiple tissues improves association detection",
+    "volume": "15",
+    "author": [
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Jiamao",
+        "family": "Zheng"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Dan L.",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "container-title": "PLOS Genetics",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Vincent",
+        "family": "Plagnol"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          1,
+          22
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghs8vx",
+    "container-title-short": "PLoS Genet",
+    "PMCID": "PMC6358100",
+    "PMID": "30668570",
+    "id": "1FFzCXo1s",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pgen.1007889"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "12",
+    "DOI": "10.1371/journal.pgen.1008489",
+    "type": "article-journal",
+    "page": "e1008489",
+    "source": "Crossref",
+    "title": "Are drug targets with genetic support twice as likely to be approved? Revised estimates of the impact of genetic support for drug mechanisms on the probability of drug approval",
+    "volume": "15",
+    "author": [
+      {
+        "given": "Emily A.",
+        "family": "King"
+      },
+      {
+        "given": "J. Wade",
+        "family": "Davis"
+      },
+      {
+        "given": "Jacob F.",
+        "family": "Degner"
+      }
+    ],
+    "container-title": "PLOS Genetics",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Jonathan",
+        "family": "Marchini"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          12,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gg957r",
+    "container-title-short": "PLoS Genet",
+    "PMCID": "PMC6907751",
+    "PMID": "31830040",
+    "id": "PgEwSS4Q",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pgen.1008489"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "4",
+    "abstract": "<jats:p>Transcriptome-wide association studies (TWAS) have been widely used to integrate transcriptomic and genetic data to study complex human diseases. Within a test dataset lacking transcriptomic data, traditional two-stage TWAS methods first impute gene expression by creating a weighted sum that aggregates SNPs with their corresponding cis-eQTL effects on reference transcriptome. Traditional TWAS methods then employ a linear regression model to assess the association between imputed gene expression and test phenotype, thereby assuming the effect of a cis-eQTL SNP on test phenotype is a linear function of the eQTL’s estimated effect on reference transcriptome. To increase TWAS robustness to this assumption, we propose a novel Variance-Component TWAS procedure (VC-TWAS) that assumes the effects of cis-eQTL SNPs on phenotype are random (with variance proportional to corresponding reference cis-eQTL effects) rather than fixed. VC-TWAS is applicable to both continuous and dichotomous phenotypes, as well as individual-level and summary-level GWAS data. Using simulated data, we show VC-TWAS is more powerful than traditional TWAS methods based on a two-stage Burden test, especially when eQTL genetic effects on test phenotype are no longer a linear function of their eQTL genetic effects on reference transcriptome. We further applied VC-TWAS to both individual-level (N = ~3.4K) and summary-level (N = ~54K) GWAS data to study Alzheimer’s dementia (AD). With the individual-level data, we detected 13 significant risk genes including 6 known GWAS risk genes such as <jats:italic>TOMM40</jats:italic> that were missed by traditional TWAS methods. With the summary-level data, we detected 57 significant risk genes considering only cis-SNPs and 71 significant genes considering both cis- and trans- SNPs, which also validated our findings with the individual-level GWAS data. Our VC-TWAS method is implemented in the TIGAR tool for public use.</jats:p>",
+    "DOI": "10.1371/journal.pgen.1009482",
+    "type": "article-journal",
+    "page": "e1009482",
+    "source": "Crossref",
+    "title": "Novel Variance-Component TWAS method for studying complex human diseases with applications to Alzheimer’s dementia",
+    "volume": "17",
+    "author": [
+      {
+        "given": "Shizhen",
+        "family": "Tang"
+      },
+      {
+        "given": "Aron S.",
+        "family": "Buchman"
+      },
+      {
+        "given": "Philip L.",
+        "family": "De Jager"
+      },
+      {
+        "given": "David A.",
+        "family": "Bennett"
+      },
+      {
+        "given": "Michael P.",
+        "family": "Epstein"
+      },
+      {
+        "given": "Jingjing",
+        "family": "Yang"
+      }
+    ],
+    "container-title": "PLOS Genetics",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Lin",
+        "family": "Chen"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          4,
+          2
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjpr3j",
+    "container-title-short": "PLoS Genet",
+    "PMCID": "PMC8046351",
+    "PMID": "33798195",
+    "id": "yEdRP9Xx",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pgen.1009482"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "10",
+    "DOI": "10.1371/journal.pone.0109760",
+    "type": "article-journal",
+    "page": "e109760",
+    "source": "Crossref",
+    "title": "Copy Number Loss of the Interferon Gene Cluster in Melanomas Is Linked to Reduced T Cell Infiltrate and Poor Patient Prognosis",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Peter S.",
+        "family": "Linsley"
+      },
+      {
+        "given": "Cate",
+        "family": "Speake"
+      },
+      {
+        "given": "Elizabeth",
+        "family": "Whalen"
+      },
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      }
+    ],
+    "container-title": "PLoS ONE",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Maria G.",
+        "family": "Castro"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          10,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gk9k8s",
+    "container-title-short": "PLoS ONE",
+    "PMCID": "PMC4196925",
+    "PMID": "25314013",
+    "id": "D5XBhzim",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pone.0109760"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "1",
+    "DOI": "10.1371/journal.pone.0192082",
+    "type": "article-journal",
+    "page": "e0192082",
+    "source": "Crossref",
+    "title": "Shared and organism-specific host responses to childhood diarrheal diseases revealed by whole blood transcript profiling",
+    "volume": "13",
+    "author": [
+      {
+        "given": "Hannah A.",
+        "family": "DeBerg"
+      },
+      {
+        "given": "Mussaret B.",
+        "family": "Zaidi"
+      },
+      {
+        "given": "Matthew C.",
+        "family": "Altman"
+      },
+      {
+        "given": "Prasong",
+        "family": "Khaenam"
+      },
+      {
+        "given": "Vivian H.",
+        "family": "Gersuk"
+      },
+      {
+        "given": "Freddy D.",
+        "family": "Campos"
+      },
+      {
+        "given": "Iza",
+        "family": "Perez-Martinez"
+      },
+      {
+        "given": "Mario",
+        "family": "Meza-Segura"
+      },
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      },
+      {
+        "given": "Jacques",
+        "family": "Banchereau"
+      },
+      {
+        "given": "Teresa",
+        "family": "Estrada-Garcia"
+      },
+      {
+        "given": "Peter S.",
+        "family": "Linsley"
+      }
+    ],
+    "container-title": "PLOS ONE",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Karol",
+        "family": "Sestak"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          1,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gcwgcr",
+    "container-title-short": "PLoS ONE",
+    "PMCID": "PMC5788382",
+    "PMID": "29377961",
+    "id": "RliFvowC",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pone.0192082"
+  },
+  {
+    "publisher": "Frontiers Media SA",
+    "DOI": "10.3389/fphys.2020.00393",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Lipid and Lipoprotein Metabolism in Microglia",
+    "volume": "11",
+    "author": [
+      {
+        "given": "Bailey A.",
+        "family": "Loving"
+      },
+      {
+        "given": "Kimberley D.",
+        "family": "Bruce"
+      }
+    ],
+    "container-title": "Frontiers in Physiology",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          4,
+          28
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gk92xd",
+    "container-title-short": "Front. Physiol.",
+    "PMCID": "PMC7198855",
+    "PMID": "32411016",
+    "id": "18I4ish9s",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.3389/fphys.2020.00393"
+  },
+  {
+    "publisher": "MDPI AG",
+    "issue": "7",
+    "DOI": "10.3390/v5071664",
+    "type": "article-journal",
+    "page": "1664-1681",
+    "source": "Crossref",
+    "title": "Identification of Genes Critical for Resistance to Infection by West Nile Virus Using RNA-Seq Analysis",
+    "volume": "5",
+    "author": [
+      {
+        "given": "Feng",
+        "family": "Qian"
+      },
+      {
+        "given": "Lisa",
+        "family": "Chung"
+      },
+      {
+        "given": "Wei",
+        "family": "Zheng"
+      },
+      {
+        "given": "Vincent",
+        "family": "Bruno"
+      },
+      {
+        "given": "Roger",
+        "family": "Alexander"
+      },
+      {
+        "given": "Zhong",
+        "family": "Wang"
+      },
+      {
+        "given": "Xiaomei",
+        "family": "Wang"
+      },
+      {
+        "given": "Sebastian",
+        "family": "Kurscheid"
+      },
+      {
+        "given": "Hongyu",
+        "family": "Zhao"
+      },
+      {
+        "given": "Erol",
+        "family": "Fikrig"
+      },
+      {
+        "given": "Mark",
+        "family": "Gerstein"
+      },
+      {
+        "given": "Michael",
+        "family": "Snyder"
+      },
+      {
+        "given": "Ruth",
+        "family": "Montgomery"
+      }
+    ],
+    "container-title": "Viruses",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          7,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f49d7g",
+    "container-title-short": "Viruses",
+    "PMCID": "PMC3738954",
+    "PMID": "23881275",
+    "id": "mtMYROCN",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.3390/v5071664"
+  },
+  {
+    "type": "article",
+    "id": "1DJZvtwP1",
+    "categories": [
+      "LINCS",
+      "L1000",
+      "consensus",
+      "database",
+      "perturbation",
+      "Rephetio"
+    ],
+    "author": [
+      {
+        "family": "Himmelstein",
+        "given": "Daniel"
+      },
+      {
+        "family": "Brueggeman",
+        "given": "Leo"
+      },
+      {
+        "family": "Baranzini",
+        "given": "Sergio"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          3,
+          8
+        ]
+      ]
+    },
+    "abstract": "This repository creates user-friendly datasets for LINCS L1000. We extend the L1000 data offerings with consensus signatures, compound mappings, and chemical similarities. Read about this release on Thinklab.",
+    "DOI": "10.5281/zenodo.47223",
+    "publisher": "Zenodo",
+    "title": "Dhimmel/Lincs V2.0: Refined Consensus Signatures From Lincs L1000",
+    "URL": "https://doi.org/f3mqvr",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.5281/zenodo.47223"
+  },
+  {
+    "type": "article",
+    "id": "10KA5jTBQ",
+    "categories": [
+      "indications",
+      "PharmacotherapyDB",
+      "disease modifying",
+      "drugs",
+      "disease",
+      "Rephetio"
+    ],
+    "author": [
+      {
+        "family": "Himmelstein",
+        "given": "Daniel S."
+      },
+      {
+        "literal": "Pouya Khankhanian"
+      },
+      {
+        "family": "Hessler",
+        "given": "Christine S."
+      },
+      {
+        "family": "Green",
+        "given": "Ari J."
+      },
+      {
+        "family": "Baranzini",
+        "given": "Sergio E."
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          3,
+          15
+        ]
+      ]
+    },
+    "abstract": "This is the repository for the initial release of our catalog of drug therapies for disease. The catalog, named PharmacotherapyDB, contains physician curated medical indications. The data for this release is also on figshare\n\nThis initial release contains 97 diseases and 601 drugs. Between these drug–disease pairs, there are 755 disease-modifying therapies, 390 symptomatic therapies, and 243 non-indications. To enable integrative analyses, drugs and diseases are coded using DrugBank and Disease Ontology identifiers.\n\nThe catalog adheres to pathophysiological principals first. Therefore, the catalog includes indications with a poor risk–benefit ratio that are rarely used in the modern clinic. Contributions are welcome as we hope to expand and refine the catalog over time.\n\nRead more on Thinklab.",
+    "DOI": "10.5281/zenodo.47664",
+    "publisher": "Zenodo",
+    "title": "Dhimmel/Indications V1.0. Pharmacotherapydb: The Open Catalog Of Drug Therapies For Disease",
+    "URL": "https://doi.org/f3mqwb",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.5281/zenodo.47664"
+  },
+  {
+    "publisher": "eLife Sciences Publications, Ltd",
+    "abstract": "<jats:p>The ability to computationally predict whether a compound treats a disease would improve the economy and success rate of drug approval. This study describes Project Rephetio to systematically model drug efficacy based on 755 existing treatments. First, we constructed Hetionet (neo4j.het.io), an integrative network encoding knowledge from millions of biomedical studies. Hetionet v1.0 consists of 47,031 nodes of 11 types and 2,250,197 relationships of 24 types. Data were integrated from 29 public resources to connect compounds, diseases, genes, anatomies, pathways, biological processes, molecular functions, cellular components, pharmacologic classes, side effects, and symptoms. Next, we identified network patterns that distinguish treatments from non-treatments. Then, we predicted the probability of treatment for 209,168 compound–disease pairs (het.io/repurpose). Our predictions validated on two external sets of treatment and provided pharmacological insights on epilepsy, suggesting they will help prioritize drug repurposing candidates. This study was entirely open and received realtime feedback from 40 community members.</jats:p>",
+    "DOI": "10.7554/elife.26726",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Systematic integration of biomedical knowledge prioritizes drugs for repurposing",
+    "volume": "6",
+    "author": [
+      {
+        "given": "Daniel Scott",
+        "family": "Himmelstein"
+      },
+      {
+        "given": "Antoine",
+        "family": "Lizee"
+      },
+      {
+        "given": "Christine",
+        "family": "Hessler"
+      },
+      {
+        "given": "Leo",
+        "family": "Brueggeman"
+      },
+      {
+        "given": "Sabrina L",
+        "family": "Chen"
+      },
+      {
+        "given": "Dexter",
+        "family": "Hadley"
+      },
+      {
+        "given": "Ari",
+        "family": "Green"
+      },
+      {
+        "given": "Pouya",
+        "family": "Khankhanian"
+      },
+      {
+        "given": "Sergio E",
+        "family": "Baranzini"
+      }
+    ],
+    "container-title": "eLife",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          9,
+          22
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cdfk",
+    "PMCID": "PMC5640425",
+    "PMID": "28936969",
+    "id": "O21tn8vf",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.7554/elife.26726"
+  },
+  {
+    "title": "Depression as a predictor for coronary heart disease. a review and meta-analysis.",
+    "volume": "23",
+    "issue": "1",
+    "page": "51-61",
+    "container-title": "American journal of preventive medicine",
+    "container-title-short": "Am J Prev Med",
+    "ISSN": "0749-3797",
+    "issued": {
+      "date-parts": [
+        [
+          2002,
+          7
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Reiner",
+        "family": "Rugulies"
+      }
+    ],
+    "PMID": "12093424",
+    "DOI": "10.1016/s0749-3797(02)00439-7",
+    "abstract": "To review and quantify the impact of depression on the development of coronary heart disease (CHD) in initially healthy subjects.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/12093424",
+    "type": "article-journal",
+    "id": "10qjLoufR",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:12093424"
+  },
+  {
+    "title": "Elevated rates of protein secretion, evolution, and disease among tissue-specific genes.",
+    "volume": "14",
+    "issue": "1",
+    "page": "54-61",
+    "container-title": "Genome research",
+    "container-title-short": "Genome Res",
+    "ISSN": "1088-9051",
+    "issued": {
+      "date-parts": [
+        [
+          2004,
+          1
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Eitan E",
+        "family": "Winter"
+      },
+      {
+        "given": "Leo",
+        "family": "Goodstadt"
+      },
+      {
+        "given": "Chris P",
+        "family": "Ponting"
+      }
+    ],
+    "PMID": "14707169",
+    "PMCID": "PMC314278",
+    "DOI": "10.1101/gr.1924004",
+    "abstract": "Variation in gene expression has been held responsible for the functional and morphological specialization of tissues. The tissue specificity of genes is known to correlate positively with gene evolution rates. We show here, using large data sets, that when a gene is expressed highly in a small number of tissues, its protein is more likely to be secreted and more likely to be mutated in genetic diseases with Mendelian inheritance. We find that secreted proteins are evolving at faster rates than nonsecreted proteins, and that their evolutionary rates are highly correlated with tissue specificity. However, the impact of secretion on evolutionary rates is countered by tissue-specific constraints that have been held constant over the past 75 million years. We find that disease genes are underrepresented among intracellular and slowly evolving housekeeping genes. These findings illuminate major selective pressures that have shaped the gene repertoires expressed in different mammalian tissues.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/14707169",
+    "type": "article-journal",
+    "id": "18jYvPauB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:14707169"
+  },
+  {
+    "title": "How does gene expression clustering work?",
+    "volume": "23",
+    "issue": "12",
+    "page": "1499-501",
+    "container-title": "Nature biotechnology",
+    "container-title-short": "Nat Biotechnol",
+    "ISSN": "1087-0156",
+    "issued": {
+      "date-parts": [
+        [
+          2005,
+          12
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Patrik",
+        "family": "D'haeseleer"
+      }
+    ],
+    "PMID": "16333293",
+    "DOI": "10.1038/nbt1205-1499",
+    "abstract": "Clustering is often one of the first steps in gene expression analysis. How do clustering algorithms work, which ones should we use and what can we expect from them?",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/16333293",
+    "type": "article-journal",
+    "id": "VzZoy0BD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:16333293"
+  },
+  {
+    "title": "A modular analysis framework for blood genomics studies: application to systemic lupus erythematosus.",
+    "volume": "29",
+    "issue": "1",
+    "page": "150-64",
+    "container-title": "Immunity",
+    "container-title-short": "Immunity",
+    "ISSN": "1097-4180",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          7,
+          18
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      },
+      {
+        "given": "Charles",
+        "family": "Quinn"
+      },
+      {
+        "given": "Jing",
+        "family": "Shen"
+      },
+      {
+        "given": "Pinakeen",
+        "family": "Patel"
+      },
+      {
+        "given": "Casey",
+        "family": "Glaser"
+      },
+      {
+        "given": "Nicole",
+        "family": "Baldwin"
+      },
+      {
+        "given": "Dorothee",
+        "family": "Stichweh"
+      },
+      {
+        "given": "Derek",
+        "family": "Blankenship"
+      },
+      {
+        "given": "Lei",
+        "family": "Li"
+      },
+      {
+        "given": "Indira",
+        "family": "Munagala"
+      },
+      {
+        "given": "Lynda",
+        "family": "Bennett"
+      },
+      {
+        "given": "Florence",
+        "family": "Allantaz"
+      },
+      {
+        "given": "Asuncion",
+        "family": "Mejias"
+      },
+      {
+        "given": "Monica",
+        "family": "Ardura"
+      },
+      {
+        "given": "Ellen",
+        "family": "Kaizer"
+      },
+      {
+        "given": "Laurence",
+        "family": "Monnet"
+      },
+      {
+        "given": "Windy",
+        "family": "Allman"
+      },
+      {
+        "given": "Henry",
+        "family": "Randall"
+      },
+      {
+        "given": "Diane",
+        "family": "Johnson"
+      },
+      {
+        "given": "Aimee",
+        "family": "Lanier"
+      },
+      {
+        "given": "Marilynn",
+        "family": "Punaro"
+      },
+      {
+        "given": "Knut M",
+        "family": "Wittkowski"
+      },
+      {
+        "given": "Perrin",
+        "family": "White"
+      },
+      {
+        "given": "Joseph",
+        "family": "Fay"
+      },
+      {
+        "given": "Goran",
+        "family": "Klintmalm"
+      },
+      {
+        "given": "Octavio",
+        "family": "Ramilo"
+      },
+      {
+        "given": "A Karolina",
+        "family": "Palucka"
+      },
+      {
+        "given": "Jacques",
+        "family": "Banchereau"
+      },
+      {
+        "given": "Virginia",
+        "family": "Pascual"
+      }
+    ],
+    "PMID": "18631455",
+    "PMCID": "PMC2727981",
+    "DOI": "10.1016/j.immuni.2008.05.012",
+    "abstract": "The analysis of patient blood transcriptional profiles offers a means to investigate the immunological mechanisms relevant to human diseases on a genome-wide scale. In addition, such studies provide a basis for the discovery of clinically relevant biomarker signatures. We designed a strategy for microarray analysis that is based on the identification of transcriptional modules formed by genes coordinately expressed in multiple disease data sets. Mapping changes in gene expression at the module level generated disease-specific transcriptional fingerprints that provide a stable framework for the visualization and functional interpretation of microarray data. These transcriptional modules were used as a basis for the selection of biomarkers and the development of a multivariate transcriptional indicator of disease progression in patients with systemic lupus erythematosus. Thus, this work describes the implementation and application of a methodology designed to support systems-scale analysis of the human immune system in translational research settings.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/18631455",
+    "type": "article-journal",
+    "id": "S56q1qoc",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:18631455"
+  },
+  {
+    "title": "A large-scale analysis of tissue-specific pathology and gene expression of human disease genes and complexes.",
+    "volume": "105",
+    "issue": "52",
+    "page": "20870-5",
+    "container-title": "Proceedings of the National Academy of Sciences of the United States of America",
+    "container-title-short": "Proc Natl Acad Sci U S A",
+    "ISSN": "1091-6490",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          12,
+          22
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Kasper",
+        "family": "Lage"
+      },
+      {
+        "given": "Niclas Tue",
+        "family": "Hansen"
+      },
+      {
+        "given": "E Olof",
+        "family": "Karlberg"
+      },
+      {
+        "given": "Aron C",
+        "family": "Eklund"
+      },
+      {
+        "given": "Francisco S",
+        "family": "Roque"
+      },
+      {
+        "given": "Patricia K",
+        "family": "Donahoe"
+      },
+      {
+        "given": "Zoltan",
+        "family": "Szallasi"
+      },
+      {
+        "given": "Thomas Skøt",
+        "family": "Jensen"
+      },
+      {
+        "given": "Søren",
+        "family": "Brunak"
+      }
+    ],
+    "PMID": "19104045",
+    "PMCID": "PMC2606902",
+    "DOI": "10.1073/pnas.0810772105",
+    "abstract": "Heritable diseases are caused by germ-line mutations that, despite tissuewide presence, often lead to tissue-specific pathology. Here, we make a systematic analysis of the link between tissue-specific gene expression and pathological manifestations in many human diseases and cancers. Diseases were systematically mapped to tissues they affect from disease-relevant literature in PubMed to create a disease-tissue covariation matrix of high-confidence associations of >1,000 diseases to 73 tissues. By retrieving >2,000 known disease genes, and generating 1,500 disease-associated protein complexes, we analyzed the differential expression of a gene or complex involved in a particular disease in the tissues affected by the disease, compared with nonaffected tissues. When this analysis is scaled to all diseases in our dataset, there is a significant tendency for disease genes and complexes to be overexpressed in the normal tissues where defects cause pathology. In contrast, cancer genes and complexes were not overexpressed in the tissues from which the tumors emanate. We specifically identified a complex involved in XY sex reversal that is testis-specific and down-regulated in ovaries. We also identified complexes in Parkinson disease, cardiomyopathies, and muscular dystrophy syndromes that are similarly tissue specific. Our method represents a conceptual scaffold for organism-spanning analyses and reveals an extensive list of tissue-specific draft molecular pathways, both known and unexpected, that might be disrupted in disease.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/19104045",
+    "type": "article-journal",
+    "id": "e0tRKjE5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:19104045"
+  },
+  {
+    "title": "Relaxed purifying selection and possibly high rate of adaptation in primate lineage-specific genes.",
+    "volume": "2",
+    "page": "393-409",
+    "container-title": "Genome biology and evolution",
+    "container-title-short": "Genome Biol Evol",
+    "ISSN": "1759-6653",
+    "issued": {
+      "date-parts": [
+        [
+          2010,
+          7,
+          12
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "James J",
+        "family": "Cai"
+      },
+      {
+        "given": "Dmitri A",
+        "family": "Petrov"
+      }
+    ],
+    "PMID": "20624743",
+    "PMCID": "PMC2997544",
+    "DOI": "10.1093/gbe/evq019",
+    "abstract": "Genes in the same organism vary in the time since their evolutionary origin. Without horizontal gene transfer, young genes are necessarily restricted to a few closely related species, whereas old genes can be broadly distributed across the phylogeny. It has been shown that young genes evolve faster than old genes; however, the evolutionary forces responsible for this pattern remain obscure. Here, we classify human-chimp protein-coding genes into different age classes, according to the breath of their phylogenetic distribution. We estimate the strength of purifying selection and the rate of adaptive selection for genes in different age classes. We find that older genes carry fewer and less frequent nonsynonymous single-nucleotide polymorphisms than younger genes suggesting that older genes experience a stronger purifying selection at the protein-coding level. We infer the distribution of fitness effects of new deleterious mutations and find that older genes have proportionally more slightly deleterious mutations and fewer nearly neutral mutations than younger genes. To investigate the role of adaptive selection of genes in different age classes, we determine the selection coefficient (gamma = 2N(e)s) of genes using the MKPRF approach and estimate the ratio of the rate of adaptive nonsynonymous substitution to synonymous substitution (omega(A)) using the DoFE method. Although the proportion of positively selected genes (gamma > 0) is significantly higher in younger genes, we find no correlation between omega(A) and gene age. Collectively, these results provide strong evidence that younger genes are subject to weaker purifying selection and more tenuous evidence that they also undergo adaptive evolution more frequently.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/20624743",
+    "type": "article-journal",
+    "id": "O0e3EhY6",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:20624743"
+  },
+  {
+    "title": "Niacin in patients with low HDL cholesterol levels receiving intensive statin therapy.",
+    "volume": "365",
+    "issue": "24",
+    "page": "2255-67",
+    "container-title": "The New England journal of medicine",
+    "container-title-short": "N Engl J Med",
+    "ISSN": "1533-4406",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          11,
+          15
+        ]
+      ]
+    },
+    "author": [
+      {},
+      {
+        "given": "William E",
+        "family": "Boden"
+      },
+      {
+        "given": "Jeffrey L",
+        "family": "Probstfield"
+      },
+      {
+        "given": "Todd",
+        "family": "Anderson"
+      },
+      {
+        "given": "Bernard R",
+        "family": "Chaitman"
+      },
+      {
+        "given": "Patrice",
+        "family": "Desvignes-Nickens"
+      },
+      {
+        "given": "Kent",
+        "family": "Koprowicz"
+      },
+      {
+        "given": "Ruth",
+        "family": "McBride"
+      },
+      {
+        "given": "Koon",
+        "family": "Teo"
+      },
+      {
+        "given": "William",
+        "family": "Weintraub"
+      }
+    ],
+    "PMID": "22085343",
+    "DOI": "10.1056/nejmoa1107579",
+    "abstract": "In patients with established cardiovascular disease, residual cardiovascular risk persists despite the achievement of target low-density lipoprotein (LDL) cholesterol levels with statin therapy. It is unclear whether extended-release niacin added to simvastatin to raise low levels of high-density lipoprotein (HDL) cholesterol is superior to simvastatin alone in reducing such residual risk.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/22085343",
+    "type": "article-journal",
+    "id": "bRPc66OD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:22085343"
+  },
+  {
+    "title": "Architecture of the human regulatory network derived from ENCODE data.",
+    "volume": "489",
+    "issue": "7414",
+    "page": "91-100",
+    "container-title": "Nature",
+    "container-title-short": "Nature",
+    "ISSN": "1476-4687",
+    "issued": {
+      "date-parts": [
+        [
+          2012,
+          9,
+          6
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Mark B",
+        "family": "Gerstein"
+      },
+      {
+        "given": "Anshul",
+        "family": "Kundaje"
+      },
+      {
+        "given": "Manoj",
+        "family": "Hariharan"
+      },
+      {
+        "given": "Stephen G",
+        "family": "Landt"
+      },
+      {
+        "given": "Koon-Kiu",
+        "family": "Yan"
+      },
+      {
+        "given": "Chao",
+        "family": "Cheng"
+      },
+      {
+        "given": "Xinmeng Jasmine",
+        "family": "Mu"
+      },
+      {
+        "given": "Ekta",
+        "family": "Khurana"
+      },
+      {
+        "given": "Joel",
+        "family": "Rozowsky"
+      },
+      {
+        "given": "Roger",
+        "family": "Alexander"
+      },
+      {
+        "given": "Renqiang",
+        "family": "Min"
+      },
+      {
+        "given": "Pedro",
+        "family": "Alves"
+      },
+      {
+        "given": "Alexej",
+        "family": "Abyzov"
+      },
+      {
+        "given": "Nick",
+        "family": "Addleman"
+      },
+      {
+        "given": "Nitin",
+        "family": "Bhardwaj"
+      },
+      {
+        "given": "Alan P",
+        "family": "Boyle"
+      },
+      {
+        "given": "Philip",
+        "family": "Cayting"
+      },
+      {
+        "given": "Alexandra",
+        "family": "Charos"
+      },
+      {
+        "given": "David Z",
+        "family": "Chen"
+      },
+      {
+        "given": "Yong",
+        "family": "Cheng"
+      },
+      {
+        "given": "Declan",
+        "family": "Clarke"
+      },
+      {
+        "given": "Catharine",
+        "family": "Eastman"
+      },
+      {
+        "given": "Ghia",
+        "family": "Euskirchen"
+      },
+      {
+        "given": "Seth",
+        "family": "Frietze"
+      },
+      {
+        "given": "Yao",
+        "family": "Fu"
+      },
+      {
+        "given": "Jason",
+        "family": "Gertz"
+      },
+      {
+        "given": "Fabian",
+        "family": "Grubert"
+      },
+      {
+        "given": "Arif",
+        "family": "Harmanci"
+      },
+      {
+        "given": "Preti",
+        "family": "Jain"
+      },
+      {
+        "given": "Maya",
+        "family": "Kasowski"
+      },
+      {
+        "given": "Phil",
+        "family": "Lacroute"
+      },
+      {
+        "given": "Jing Jane",
+        "family": "Leng"
+      },
+      {
+        "given": "Jin",
+        "family": "Lian"
+      },
+      {
+        "given": "Hannah",
+        "family": "Monahan"
+      },
+      {
+        "given": "Henriette",
+        "family": "O'Geen"
+      },
+      {
+        "given": "Zhengqing",
+        "family": "Ouyang"
+      },
+      {
+        "given": "E Christopher",
+        "family": "Partridge"
+      },
+      {
+        "given": "Dorrelyn",
+        "family": "Patacsil"
+      },
+      {
+        "given": "Florencia",
+        "family": "Pauli"
+      },
+      {
+        "given": "Debasish",
+        "family": "Raha"
+      },
+      {
+        "given": "Lucia",
+        "family": "Ramirez"
+      },
+      {
+        "given": "Timothy E",
+        "family": "Reddy"
+      },
+      {
+        "given": "Brian",
+        "family": "Reed"
+      },
+      {
+        "given": "Minyi",
+        "family": "Shi"
+      },
+      {
+        "given": "Teri",
+        "family": "Slifer"
+      },
+      {
+        "given": "Jing",
+        "family": "Wang"
+      },
+      {
+        "given": "Linfeng",
+        "family": "Wu"
+      },
+      {
+        "given": "Xinqiong",
+        "family": "Yang"
+      },
+      {
+        "given": "Kevin Y",
+        "family": "Yip"
+      },
+      {
+        "given": "Gili",
+        "family": "Zilberman-Schapira"
+      },
+      {
+        "given": "Serafim",
+        "family": "Batzoglou"
+      },
+      {
+        "given": "Arend",
+        "family": "Sidow"
+      },
+      {
+        "given": "Peggy J",
+        "family": "Farnham"
+      },
+      {
+        "given": "Richard M",
+        "family": "Myers"
+      },
+      {
+        "given": "Sherman M",
+        "family": "Weissman"
+      },
+      {
+        "given": "Michael",
+        "family": "Snyder"
+      }
+    ],
+    "PMID": "22955619",
+    "PMCID": "PMC4154057",
+    "DOI": "10.1038/nature11245",
+    "abstract": "Transcription factors bind in a combinatorial fashion to specify the on-and-off states of genes; the ensemble of these binding events forms a regulatory network, constituting the wiring diagram for a cell. To examine the principles of the human transcriptional regulatory network, we determined the genomic binding information of 119 transcription-related factors in over 450 distinct experiments. We found the combinatorial, co-association of transcription factors to be highly context specific: distinct combinations of factors bind at specific genomic locations. In particular, there are significant differences in the binding proximal and distal to genes. We organized all the transcription factor binding into a hierarchy and integrated it with other genomic information (for example, microRNA regulation), forming a dense meta-network. Factors at different levels have different properties; for instance, top-level transcription factors more strongly influence expression and middle-level ones co-regulate targets to mitigate information-flow bottlenecks. Moreover, these co-regulations give rise to many enriched network motifs (for example, noise-buffering feed-forward loops). Finally, more connected network components are under stronger selection and exhibit a greater degree of allele-specific activity (that is, differential binding to the two parental alleles). The regulatory information obtained in this study will be crucial for interpreting personal genome sequences and understanding basic principles of human biology and disease.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/22955619",
+    "type": "article-journal",
+    "id": "jrAMOJCD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:22955619"
+  },
+  {
+    "title": "The therapeutic role of niacin in dyslipidemia management.",
+    "volume": "19",
+    "issue": "2",
+    "page": "141-58",
+    "container-title": "Journal of cardiovascular pharmacology and therapeutics",
+    "container-title-short": "J Cardiovasc Pharmacol Ther",
+    "ISSN": "1940-4034",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          12,
+          20
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "William E",
+        "family": "Boden"
+      },
+      {
+        "given": "Mandeep S",
+        "family": "Sidhu"
+      },
+      {
+        "given": "Peter P",
+        "family": "Toth"
+      }
+    ],
+    "PMID": "24363242",
+    "DOI": "10.1177/1074248413514481",
+    "abstract": "There is abundant epidemiologic evidence to support the independent, inverse relationship between low levels of high-density lipoprotein cholesterol (HDL-C) and incident cardiovascular (CV) risk, the clinical importance of which is underscored by the high prevalence of low HDL-C in populations with coronary heart disease (CHD), with or without elevated levels of low-density lipoprotein cholesterol (LDL-C). The National Cholesterol Education Program recommended that optimal treatment for high-risk patients includes both lowering LDL-C and non-HDL-C to risk stratified levels and raising HDL-C when it is <40 mg/dL, although no target level for the latter lipoprotein was suggested. Niacin is the most powerful agent currently available for raising low levels of HDL-C. It also induces significant reductions in triglycerides, lipoprotein(a), and LDL-C levels while also favorably altering LDL particle size and number. In the Coronary Drug Project, niacin treatment was associated with significant reductions in CV events and long-term mortality, similar to the reductions seen in the statin monotherapy trials. In combination trials, niacin plus a statin or bile acid sequestrant produces additive reductions in CHD morbidity and mortality and promotes regression of coronary atherosclerosis. Recently, 2 clinical outcome trials (Atherothrombosis Intervention in Metabolic Syndrome With Low HDL/High Triglycerides and Impact on Global Health Outcomes [AIM-HIGH] and Second Heart Protection Study [HPS-2 THRIVE]) failed to show a reduction in CV events in patients treated to optimally low levels of LDL-C. Despite favorable effects on HDL-C and triglycerides, these studies did not demonstrate incremental clinical benefit with niacin when added to simvastatin, although notable limitations were identified in each of these trials. Thus, there is insufficient evidence from clinical trials to recommend HDL-targeted therapy for additional event reduction at the present time. 
However, niacin should continue to be used as an adjuvant therapy for reducing atherogenic lipoprotein burden in patients who have not reached their risk stratified LDL-C and non-HDL-C targets. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/24363242",
+    "type": "article-journal",
+    "id": "OOAkmacQ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:24363242"
+  },
+  {
+    "title": "Democratizing systems immunology with modular transcriptional repertoire analyses.",
+    "volume": "14",
+    "issue": "4",
+    "page": "271-80",
+    "container-title": "Nature reviews. Immunology",
+    "container-title-short": "Nat Rev Immunol",
+    "ISSN": "1474-1741",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          4
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      },
+      {
+        "given": "Nicole",
+        "family": "Baldwin"
+      }
+    ],
+    "PMID": "24662387",
+    "PMCID": "PMC4118927",
+    "DOI": "10.1038/nri3642",
+    "abstract": "Individual elements that constitute the immune system have been characterized over the few past decades, mostly through reductionist approaches. The introduction of large-scale profiling platforms has more recently facilitated the assessment of these elements on a global scale. However, the analysis and the interpretation of such large-scale datasets remains a challenge and a barrier for the wider adoption of systems approaches in immunological and clinical studies. In this Innovation article, we describe an analytical strategy that relies on the a priori determination of co-dependent gene sets for a given biological system. Such modular transcriptional repertoires can in turn be used to simplify the analysis and the interpretation of large-scale datasets, and to design targeted immune fingerprinting assays and web applications that will further facilitate the dissemination of systems approaches in immunology.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/24662387",
+    "type": "article-journal",
+    "id": "f2r8LLIn",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:24662387"
+  },
+  {
+    "title": "High-density lipoproteins in the prevention of cardiovascular disease: changing the paradigm.",
+    "volume": "96",
+    "issue": "1",
+    "page": "48-56",
+    "container-title": "Clinical pharmacology and therapeutics",
+    "container-title-short": "Clin Pharmacol Ther",
+    "ISSN": "1532-6535",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          4,
+          8
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "S",
+        "family": "Tuteja"
+      },
+      {
+        "given": "D J",
+        "family": "Rader"
+      }
+    ],
+    "PMID": "24713591",
+    "DOI": "10.1038/clpt.2014.79",
+    "abstract": "High-density-lipoprotein cholesterol (HDL-C) has been identified in population studies as an independent inverse predictor of cardiovascular events. Although the causal nature of this association has been questioned, HDL and its major protein, apolipoprotein (apo)A1, have been shown to prevent and reverse atherosclerosis in animal models. In addition, HDL and apoA1 have several putatively atheroprotective functions, such as the ability to promote efflux of cholesterol from macrophages in the artery wall, inhibit vascular inflammation, and enhance endothelial function. Therefore, HDL-C and apoA1 have been investigated as therapeutic targets for coronary heart disease. However, recent clinical trials with drugs that raise HDL-C, such as niacin and inhibitors of cholesteryl ester transfer protein, have been disappointing. Here, we review the current state of the science regarding HDL as a therapeutic target. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/24713591",
+    "type": "article-journal",
+    "id": "13meq3Hgt",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:24713591"
+  },
+  {
+    "title": "Effects of extended-release niacin with laropiprant in high-risk patients.",
+    "volume": "371",
+    "issue": "3",
+    "page": "203-12",
+    "container-title": "The New England journal of medicine",
+    "container-title-short": "N Engl J Med",
+    "ISSN": "1533-4406",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          7,
+          17
+        ]
+      ]
+    },
+    "author": [
+      {},
+      {
+        "given": "Martin J",
+        "family": "Landray"
+      },
+      {
+        "given": "Richard",
+        "family": "Haynes"
+      },
+      {
+        "given": "Jemma C",
+        "family": "Hopewell"
+      },
+      {
+        "given": "Sarah",
+        "family": "Parish"
+      },
+      {
+        "given": "Theingi",
+        "family": "Aung"
+      },
+      {
+        "given": "Joseph",
+        "family": "Tomson"
+      },
+      {
+        "given": "Karl",
+        "family": "Wallendszus"
+      },
+      {
+        "given": "Martin",
+        "family": "Craig"
+      },
+      {
+        "given": "Lixin",
+        "family": "Jiang"
+      },
+      {
+        "given": "Rory",
+        "family": "Collins"
+      },
+      {
+        "given": "Jane",
+        "family": "Armitage"
+      }
+    ],
+    "PMID": "25014686",
+    "DOI": "10.1056/nejmoa1300955",
+    "abstract": "Patients with evidence of vascular disease are at increased risk for subsequent vascular events despite effective use of statins to lower the low-density lipoprotein (LDL) cholesterol level. Niacin lowers the LDL cholesterol level and raises the high-density lipoprotein (HDL) cholesterol level, but its clinical efficacy and safety are uncertain.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/25014686",
+    "type": "article-journal",
+    "id": "13ZGxHjQ5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:25014686"
+  },
+  {
+    "title": "A narrow repertoire of transcriptional modules responsive to pyogenic bacteria is impaired in patients carrying loss-of-function mutations in MYD88 or IRAK4.",
+    "volume": "15",
+    "issue": "12",
+    "page": "1134-42",
+    "container-title": "Nature immunology",
+    "container-title-short": "Nat Immunol",
+    "ISSN": "1529-2916",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          10,
+          26
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Laia",
+        "family": "Alsina"
+      },
+      {
+        "given": "Elisabeth",
+        "family": "Israelsson"
+      },
+      {
+        "given": "Matthew C",
+        "family": "Altman"
+      },
+      {
+        "given": "Kristen K",
+        "family": "Dang"
+      },
+      {
+        "given": "Pegah",
+        "family": "Ghandil"
+      },
+      {
+        "given": "Laura",
+        "family": "Israel"
+      },
+      {
+        "given": "Horst",
+        "family": "von Bernuth"
+      },
+      {
+        "given": "Nicole",
+        "family": "Baldwin"
+      },
+      {
+        "given": "Huanying",
+        "family": "Qin"
+      },
+      {
+        "given": "Zongbo",
+        "family": "Jin"
+      },
+      {
+        "given": "Romain",
+        "family": "Banchereau"
+      },
+      {
+        "given": "Esperanza",
+        "family": "Anguiano"
+      },
+      {
+        "given": "Alexei",
+        "family": "Ionan"
+      },
+      {
+        "given": "Laurent",
+        "family": "Abel"
+      },
+      {
+        "given": "Anne",
+        "family": "Puel"
+      },
+      {
+        "given": "Capucine",
+        "family": "Picard"
+      },
+      {
+        "given": "Virginia",
+        "family": "Pascual"
+      },
+      {
+        "given": "Jean Laurent",
+        "family": "Casanova"
+      },
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      }
+    ],
+    "PMID": "25344726",
+    "PMCID": "PMC4281021",
+    "DOI": "10.1038/ni.3028",
+    "abstract": "Loss of function of the kinase IRAK4 or the adaptor MyD88 in humans interrupts a pathway critical for pathogen sensing and ignition of inflammation. However, patients with loss-of-function mutations in the genes encoding these factors are, unexpectedly, susceptible to only a limited range of pathogens. We employed a systems approach to investigate transcriptome responses following in vitro exposure of patients' blood to agonists of Toll-like receptors (TLRs) and receptors for interleukin 1 (IL-1Rs) and to whole pathogens. Responses to purified agonists were globally abolished, but variable residual responses were present following exposure to whole pathogens. Further delineation of the latter responses identified a narrow repertoire of transcriptional programs affected by loss of MyD88 function or IRAK4 function. Our work introduces the use of a systems approach for the global assessment of innate immune responses and the characterization of human primary immunodeficiencies. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/25344726",
+    "type": "article-journal",
+    "id": "SjGoBywE",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:25344726"
+  },
+  {
+    "title": "Prognostic value of grip strength: findings from the Prospective Urban Rural Epidemiology (PURE) study.",
+    "volume": "386",
+    "issue": "9990",
+    "page": "266-73",
+    "container-title": "Lancet (London, England)",
+    "container-title-short": "Lancet",
+    "ISSN": "1474-547X",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          5,
+          13
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Darryl P",
+        "family": "Leong"
+      },
+      {
+        "given": "Koon K",
+        "family": "Teo"
+      },
+      {
+        "given": "Sumathy",
+        "family": "Rangarajan"
+      },
+      {
+        "given": "Patricio",
+        "family": "Lopez-Jaramillo"
+      },
+      {
+        "given": "Alvaro",
+        "family": "Avezum"
+      },
+      {
+        "given": "Andres",
+        "family": "Orlandini"
+      },
+      {
+        "given": "Pamela",
+        "family": "Seron"
+      },
+      {
+        "given": "Suad H",
+        "family": "Ahmed"
+      },
+      {
+        "given": "Annika",
+        "family": "Rosengren"
+      },
+      {
+        "given": "Roya",
+        "family": "Kelishadi"
+      },
+      {
+        "given": "Omar",
+        "family": "Rahman"
+      },
+      {
+        "given": "Sumathi",
+        "family": "Swaminathan"
+      },
+      {
+        "given": "Romaina",
+        "family": "Iqbal"
+      },
+      {
+        "given": "Rajeev",
+        "family": "Gupta"
+      },
+      {
+        "given": "Scott A",
+        "family": "Lear"
+      },
+      {
+        "given": "Aytekin",
+        "family": "Oguz"
+      },
+      {
+        "given": "Khalid",
+        "family": "Yusoff"
+      },
+      {
+        "given": "Katarzyna",
+        "family": "Zatonska"
+      },
+      {
+        "given": "Jephat",
+        "family": "Chifamba"
+      },
+      {
+        "given": "Ehimario",
+        "family": "Igumbor"
+      },
+      {
+        "given": "Viswanathan",
+        "family": "Mohan"
+      },
+      {
+        "given": "Ranjit Mohan",
+        "family": "Anjana"
+      },
+      {
+        "given": "Hongqiu",
+        "family": "Gu"
+      },
+      {
+        "given": "Wei",
+        "family": "Li"
+      },
+      {
+        "given": "Salim",
+        "family": "Yusuf"
+      },
+      {}
+    ],
+    "PMID": "25982160",
+    "DOI": "10.1016/s0140-6736(14)62000-6",
+    "abstract": "Reduced muscular strength, as measured by grip strength, has been associated with an increased risk of all-cause and cardiovascular mortality. Grip strength is appealing as a simple, quick, and inexpensive means of stratifying an individual's risk of cardiovascular death. However, the prognostic value of grip strength with respect to the number and range of populations and confounders is unknown. The aim of this study was to assess the independent prognostic importance of grip strength measurement in socioculturally and economically diverse countries.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/25982160",
+    "type": "article-journal",
+    "id": "aBVh8zt1",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:25982160"
+  },
+  {
+    "title": "Optimized sgRNA design to maximize activity and minimize off-target effects of CRISPR-Cas9.",
+    "volume": "34",
+    "issue": "2",
+    "page": "184-191",
+    "container-title": "Nature biotechnology",
+    "container-title-short": "Nat Biotechnol",
+    "ISSN": "1546-1696",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          1,
+          18
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "John G",
+        "family": "Doench"
+      },
+      {
+        "given": "Nicolo",
+        "family": "Fusi"
+      },
+      {
+        "given": "Meagan",
+        "family": "Sullender"
+      },
+      {
+        "given": "Mudra",
+        "family": "Hegde"
+      },
+      {
+        "given": "Emma W",
+        "family": "Vaimberg"
+      },
+      {
+        "given": "Katherine F",
+        "family": "Donovan"
+      },
+      {
+        "given": "Ian",
+        "family": "Smith"
+      },
+      {
+        "given": "Zuzana",
+        "family": "Tothova"
+      },
+      {
+        "given": "Craig",
+        "family": "Wilen"
+      },
+      {
+        "given": "Robert",
+        "family": "Orchard"
+      },
+      {
+        "given": "Herbert W",
+        "family": "Virgin"
+      },
+      {
+        "given": "Jennifer",
+        "family": "Listgarten"
+      },
+      {
+        "given": "David E",
+        "family": "Root"
+      }
+    ],
+    "PMID": "26780180",
+    "PMCID": "PMC4744125",
+    "DOI": "10.1038/nbt.3437",
+    "abstract": "CRISPR-Cas9-based genetic screens are a powerful new tool in biology. By simply altering the sequence of the single-guide RNA (sgRNA), one can reprogram Cas9 to target different sites in the genome with relative ease, but the on-target activity and off-target effects of individual sgRNAs can vary widely. Here, we use recently devised sgRNA design rules to create human and mouse genome-wide libraries, perform positive and negative selection screens and observe that the use of these rules produced improved results. Additionally, we profile the off-target activity of thousands of sgRNAs and develop a metric to predict off-target sites. We incorporate these findings from large-scale, empirical data to improve our computational design rules and create optimized sgRNA libraries that maximize on-target activity and minimize off-target effects to enable more effective and efficient genetic screens and genome engineering. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/26780180",
+    "type": "article-journal",
+    "id": "vNXTnmxp",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:26780180"
+  },
+  {
+    "title": "Avoiding common pitfalls when clustering biological data.",
+    "volume": "9",
+    "issue": "432",
+    "page": "re6",
+    "container-title": "Science signaling",
+    "container-title-short": "Sci Signal",
+    "ISSN": "1937-9145",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          6,
+          14
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Tom",
+        "family": "Ronan"
+      },
+      {
+        "given": "Zhijie",
+        "family": "Qi"
+      },
+      {
+        "given": "Kristen M",
+        "family": "Naegle"
+      }
+    ],
+    "PMID": "27303057",
+    "DOI": "10.1126/scisignal.aad1932",
+    "abstract": "Clustering is an unsupervised learning method, which groups data points based on similarity, and is used to reveal the underlying structure of data. This computational approach is essential to understanding and visualizing the complex data that are acquired in high-throughput multidimensional biological experiments. Clustering enables researchers to make biological inferences for further experiments. Although a powerful technique, inappropriate application can lead biological researchers to waste resources and time in experimental follow-up. We review common pitfalls identified from the published molecular biology literature and present methods to avoid them. Commonly encountered pitfalls relate to the high-dimensional nature of biological data from high-throughput experiments, the failure to consider more than one clustering method for a given problem, and the difficulty in determining whether clustering has produced meaningful results. We present concrete examples of problems and solutions (clustering results) in the form of toy problems and real biological data for these issues. We also discuss ensemble clustering as an easy-to-implement method that enables the exploration of multiple clustering solutions and improves robustness of clustering solutions. Increased awareness of common clustering pitfalls will help researchers avoid overinterpreting or misinterpreting the results and missing valuable insights when clustering biological data.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/27303057",
+    "type": "article-journal",
+    "id": "S7LBsfcF",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:27303057"
+  },
+  {
+    "title": "Assessment of the Role of Niacin in Managing Cardiovascular Disease Outcomes: A Systematic Review and Meta-analysis.",
+    "volume": "2",
+    "issue": "4",
+    "page": "e192224",
+    "container-title": "JAMA network open",
+    "container-title-short": "JAMA Netw Open",
+    "ISSN": "2574-3805",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          4,
+          5
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Elvira",
+        "family": "D'Andrea"
+      },
+      {
+        "given": "Spencer P",
+        "family": "Hey"
+      },
+      {
+        "given": "Cherie L",
+        "family": "Ramirez"
+      },
+      {
+        "given": "Aaron S",
+        "family": "Kesselheim"
+      }
+    ],
+    "PMID": "30977858",
+    "PMCID": "PMC6481429",
+    "DOI": "10.1001/jamanetworkopen.2019.2224",
+    "abstract": "Niacin remains a therapeutic option for patients with cardiovascular disease, but recent studies have called into question the effectiveness of other drugs that increase high-density lipoprotein cholesterol levels.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/30977858",
+    "type": "article-journal",
+    "id": "ZGvG75Bj",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:30977858"
+  },
+  {
+    "title": "Shared and distinct genetic risk factors for childhood-onset and adult-onset asthma: genome-wide and transcriptome-wide studies.",
+    "volume": "7",
+    "issue": "6",
+    "page": "509-522",
+    "container-title": "The Lancet. Respiratory medicine",
+    "container-title-short": "Lancet Respir Med",
+    "ISSN": "2213-2619",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          4,
+          27
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Nathan",
+        "family": "Schoettler"
+      },
+      {
+        "given": "Dan L",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Carole",
+        "family": "Ober"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "PMID": "31036433",
+    "PMCID": "PMC6534440",
+    "DOI": "10.1016/s2213-2600(19)30055-4",
+    "abstract": "Childhood-onset and adult-onset asthma differ with respect to severity and comorbidities. Whether they also differ with respect to genetic risk factors has not been previously investigated in large samples. The goals of this study were to identify shared and distinct genetic risk loci for childhood-onset and adult-onset asthma, and to identify the genes that might mediate the effects of associated variation.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/31036433",
+    "type": "article-journal",
+    "id": "zwpq2IXD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:31036433"
+  },
+  {
+    "title": "UTMOST, a single and cross-tissue TWAS (Transcriptome Wide Association Study), reveals new ASD (Autism Spectrum Disorder) associated genes.",
+    "volume": "11",
+    "issue": "1",
+    "page": "256",
+    "container-title": "Translational psychiatry",
+    "container-title-short": "Transl Psychiatry",
+    "ISSN": "2158-3188",
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          4,
+          30
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Cristina",
+        "family": "Rodriguez-Fontenla"
+      },
+      {
+        "given": "Angel",
+        "family": "Carracedo"
+      }
+    ],
+    "PMID": "33931583",
+    "PMCID": "PMC8087708",
+    "DOI": "10.1038/s41398-021-01378-8",
+    "abstract": "Autism spectrum disorders (ASD) is a complex neurodevelopmental disorder that may significantly impact on the affected individual's life. Common variation (SNPs) could explain about 50% of ASD heritability. Despite this fact and the large size of the last GWAS meta-analysis, it is believed that hundreds of risk genes in ASD have yet to be discovered. New tools, such as TWAS (Transcriptome Wide Association Studies) which integrate tissue expression and genetic data, are a great approach to identify new ASD susceptibility genes. The main goal of this study is to use UTMOST with the publicly available summary statistics from the largest ASD GWAS meta-analysis as genetic input. In addition, an in silico biological characterization for the novel associated loci was performed. Our results have shown the association of 4 genes at the brain level (CIPC, PINX1, NKX2-2, and PTPRE) and have highlighted the association of NKX2-2, MANBA, ERI1, and MITF at the gastrointestinal level. The gastrointestinal associations are quite relevant given the well-established but unexplored relationship between ASD and gastrointestinal symptoms. Cross-tissue analysis has shown the association of NKX2-2 and BLK. UTMOST-associated genes together with their in silico biological characterization seems to point to different biological mechanisms underlying ASD etiology. Thus, it would not be restricted to brain tissue and it will involve the participation of other body tissues such as the gastrointestinal.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/33931583",
+    "type": "article-journal",
+    "id": "ktVcsmYD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:33931583"
+  },
+  {
+    "title": "Cluster analysis and display of genome-wide expression patterns.",
+    "volume": "95",
+    "issue": "25",
+    "page": "14863-8",
+    "container-title": "Proceedings of the National Academy of Sciences of the United States of America",
+    "container-title-short": "Proc Natl Acad Sci U S A",
+    "ISSN": "0027-8424",
+    "issued": {
+      "date-parts": [
+        [
+          1998,
+          12,
+          8
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "M B",
+        "family": "Eisen"
+      },
+      {
+        "given": "P T",
+        "family": "Spellman"
+      },
+      {
+        "given": "P O",
+        "family": "Brown"
+      },
+      {
+        "given": "D",
+        "family": "Botstein"
+      }
+    ],
+    "PMID": "9843981",
+    "PMCID": "PMC24541",
+    "DOI": "10.1073/pnas.95.25.14863",
+    "abstract": "A system of cluster analysis for genome-wide expression data from DNA microarray hybridization is described that uses standard statistical algorithms to arrange genes according to similarity in pattern of gene expression. The output is displayed graphically, conveying the clustering and the underlying expression data simultaneously in a form intuitive for biologists. We have found in the budding yeast Saccharomyces cerevisiae that clustering gene expression data groups together efficiently genes of known similar function, and we find a similar tendency in human data. Thus patterns seen in genome-wide expression experiments can be interpreted as indications of the status of cellular processes. Also, coexpression of genes of known function with poorly characterized or novel genes may provide a simple means of gaining leads to the functions of many genes for which information is not available currently.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/9843981",
+    "type": "article-journal",
+    "id": "S4e4WaP3",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:9843981"
+  },
+  {
+    "id": "16RTdMKxI",
+    "type": "book",
+    "note": "original-date: 2019-04-03T09:07:14Z\nThis CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://github.com/EBISPOT/EFO-UKB-mappings",
+    "publisher": "EBISPOT",
+    "source": "GitHub",
+    "title": "Mapping UK Biobank to the Experimental Factor Ontology (EFO)",
+    "URL": "https://github.com/EBISPOT/EFO-UKB-mappings",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "issued": {
+      "date-parts": [
+        [
+          "2022",
+          5,
+          3
+        ]
+      ]
+    }
+  },
+  {
+    "id": "GPHGnFRN",
+    "type": "book",
+    "abstract": "harmonization, liftover, and imputation of summary statistics from GWAS",
+    "genre": "Python",
+    "note": "original-date: 2018-10-26T20:24:35Z\nThis CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://github.com/hakyimlab/summary-gwas-imputation",
+    "publisher": "hakyimlab",
+    "source": "GitHub",
+    "title": "Harmonization and Imputation Overview",
+    "URL": "https://github.com/hakyimlab/summary-gwas-imputation",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "issued": {
+      "date-parts": [
+        [
+          "2022",
+          10,
+          31
+        ]
+      ]
+    }
+  },
+  {
+    "id": "VKYS05n1",
+    "type": "webpage",
+    "abstract": "A BioProject is a collection of biological data related to a single initiative, originating from a single organization or from a consortium. A BioProject record provides users a single place to find links to the diverse data types generated for that project",
+    "title": "Homo sapiens (ID 232177) - BioProject - NCBI",
+    "URL": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA232177",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://www.ncbi.nlm.nih.gov/bioproject/PRJNA232177"
+  },
+  {
+    "id": "11eausmiy",
+    "type": "webpage",
+    "abstract": "A BioProject is a collection of biological data related to a single initiative, originating from a single organization or from a consortium. A BioProject record provides users a single place to find links to the diverse data types generated for that project",
+    "title": "Homo sapiens (ID 258384) - BioProject - NCBI",
+    "URL": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA258384",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://www.ncbi.nlm.nih.gov/bioproject/PRJNA258384"
+  }
+]
diff --git a/tests/manuscripts/phenoplier_full/metadata.yaml b/tests/manuscripts/phenoplier_full/metadata.yaml
new file mode 100644
index 0000000..ea84d03
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full/metadata.yaml
@@ -0,0 +1,134 @@
+---
+title: "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
+date: 2023-09-09  # Defaults to date generated, but can specify like '2022-10-31'.
+keywords:
+  - genetic studies
+  - functional genomics
+  - gene co-expression
+  - therapeutic targets
+  - drug repurposing
+  - clustering of complex traits
+lang: en-US
+authors:
+  - name: Milton Pividori
+    github: miltondp
+    initials: MP
+    orcid: 0000-0002-3035-4403
+    twitter: miltondp
+    mastodon: miltondp
+    mastodon-server: genomic.social
+    email: milton.pividori@cuanschutz.edu
+    affiliations:
+      - Department of Biomedical Informatics, University of Colorado School of Medicine, Aurora, CO 80045, USA
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+    funders:
+      - The Gordon and Betty Moore Foundation GBMF 4552
+      - The National Human Genome Research Institute (R01 HG010067)
+      - The National Human Genome Research Institute (K99HG011898)
+      - The Eunice Kennedy Shriver National Institute of Child Health and Human Development (R01 HD109765)
+  
+  - name: Sumei Lu
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+
+  - name: Binglan Li
+    orcid: 0000-0002-0103-6107
+    affiliations:
+      - Department of Biomedical Data Science, Stanford University, Stanford, CA 94305, USA.
+
+  - name: Chun Su
+    orcid:  0000-0001-6388-8666
+    github: sckinta
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+
+  - name: Matthew E. Johnson
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+
+  - name: Wei-Qi Wei
+    affiliations:
+      - Vanderbilt University Medical Center, Nashville, TN 37232, USA
+
+  - name: Qiping Feng
+    orcid: 0000-0002-6213-793X
+    affiliations:
+      - Vanderbilt University Medical Center, Nashville, TN 37232, USA
+
+  - name: Bahram Namjou
+    affiliations:
+      - Cincinnati Children's Hospital Medical Center, Cincinnati, OH 45229, USA
+
+  - name: Krzysztof Kiryluk
+    orcid: 0000-0002-5047-6715
+    twitter: kirylukk
+    affiliations:
+      - Department of Medicine, Division of Nephrology, Vagelos College of Physicians \& Surgeons, Columbia University, New York, NY 10032, USA
+
+  - name: Iftikhar Kullo
+    affiliations:
+      - Mayo Clinic, Rochester, MN 55905, USA
+
+  - name: Yuan Luo
+    orcid: 0000-0003-0195-7456
+    affiliations:
+      - Northwestern University, Chicago, IL 60611, USA
+
+  - name: Blair D. Sullivan
+    github: bdsullivan
+    orcid: 0000-0001-7720-6208
+    twitter: blairdsullivan
+    affiliations:
+      - Kahlert School of Computing, University of Utah, Salt Lake City, UT 84112, USA
+
+  - name: Benjamin F. Voight
+    orcid: 0000-0002-6205-9994
+    twitter: bvoight28
+    github: bvoight
+    affiliations:
+      - Department of Systems Pharmacology and Translational Therapeutics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+      - Institute for Translational Medicine and Therapeutics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+
+  - name: Carsten Skarke
+    orcid: 0000-0001-5145-3681
+    twitter: CarstenSkarke
+    affiliations:
+      - Institute for Translational Medicine and Therapeutics, Department of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+
+  - name: Marylyn D. Ritchie
+    initials: MDR
+    orcid: 0000-0002-1208-1720
+    twitter: MarylynRitchie
+    email: marylyn@pennmedicine.upenn.edu
+    affiliations:
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+
+  - name: Struan F.A. Grant
+    email: grants@chop.edu
+    orcid: 0000-0003-2025-5302
+    twitter: STRUANGRANT
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+      - Division of Endocrinology and Diabetes, Children's Hospital of Philadelphia, Philadelphia, PA, 19104, USA
+      - Division of Human Genetics, Children's Hospital of Philadelphia, Philadelphia, PA, 19104, USA
+      - Department of Pediatrics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA, 19104, USA
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA, 19104, USA
+
+  - name: Casey S. Greene
+    github: cgreene
+    initials: CSG
+    orcid: 0000-0001-8713-9213
+    twitter: GreeneScientist
+    mastodon: greenescientist
+    mastodon-server: genomic.social
+    email: casey.s.greene@cuanschutz.edu
+    affiliations:
+      - Center for Health AI, University of Colorado School of Medicine, Aurora, CO 80045, USA
+      - Department of Biomedical Informatics, University of Colorado School of Medicine, Aurora, CO 80045, USA
+    funders:
+      - The Gordon and Betty Moore Foundation (GBMF 4552)
+      - The National Human Genome Research Institute (R01 HG010067)
+      - The National Cancer Institute (R01 CA237170)
+      - The Eunice Kennedy Shriver National Institute of Child Health and Human Development (R01 HD109765)
+    corresponding: true
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
new file mode 100644
index 0000000..db38ada
--- /dev/null
+++ b/tests/test_prompt_config.py
@@ -0,0 +1,104 @@
+
+import os
+import re
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+from manubot_ai_editor.editor import ManuscriptEditor, env_vars
+from manubot_ai_editor import models
+from manubot_ai_editor.models import GPT3CompletionModel, RandomManuscriptRevisionModel
+
+MANUSCRIPTS_DIR = Path(__file__).parent / "manuscripts" / "phenoplier_full"
+
+# check that this path exists and resolve it
+def test_manuscripts_dir_exists():
+    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+    assert content_dir.exists()
+
+# check that we can create a ManuscriptEditor object
+def test_create_manuscript_editor():
+    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+    editor = ManuscriptEditor(content_dir)
+    assert isinstance(editor, ManuscriptEditor)
+
+
+# ==============================================================================
+# === prompts tests, using ai_revision-config.yaml + ai_revision-prompts.yaml
+# ==============================================================================
+
+# check that we can resolve a file to a prompt, and that it's the correct prompt
+def test_resolve_prompt():
+    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+    editor = ManuscriptEditor(content_dir)
+
+    phenoplier_files_matches = {
+        # explicitly ignored in ai_revision-config.yaml
+        '00.front-matter.md': (None, 'front-matter'),
+
+        # prompts that match a part of the filename
+        '01.abstract.md': ('Test match abstract.\n', 'abstract'),
+        '02.introduction.md': ('Test match introduction or discussion.\n', 'introduction'),
+
+        # these all match the regex 04\..+\.md, hence why the match object includes a suffix
+        '04.00.results.md': ('Test match results.\n', '04.00.results.md'),
+        '04.05.00.results_framework.md': ('Test match results.\n', '04.05.00.results_framework.md'),
+        '04.05.01.crispr.md': ('Test match results.\n', '04.05.01.crispr.md'),
+        '04.15.drug_disease_prediction.md': ('Test match results.\n', '04.15.drug_disease_prediction.md'),
+        '04.20.00.traits_clustering.md': ('Test match results.\n', '04.20.00.traits_clustering.md'),
+
+        # more prompts that match a part of the filename
+        '05.discussion.md': ('Test match introduction or discussion.\n', 'discussion'),
+        '07.00.methods.md': ('Test match methods.\n', 'methods'),
+
+        # these are all explicitly ignored in ai_revision-config.yaml
+        '10.references.md': (None, 'references'),
+        '15.acknowledgements.md': (None, 'acknowledgements'),
+        '50.00.supplementary_material.md': (None, 'supplementary_material')
+    }
+
+    for filename, (expected_prompt, expected_match) in phenoplier_files_matches.items():
+        prompt, match = editor.prompt_config.get_prompt_for_filename(filename)
+
+        if expected_prompt is None:
+            assert prompt is None
+        else:
+            # we strip() here so that tests still pass, even if the user uses
+            # newlines to separate blocks and isn't aware that the trailing
+            # newline becomes part of the value
+            assert prompt.strip() == expected_prompt.strip()
+
+        if expected_match is None:
+            assert match is None
+        else:
+            assert match.string[match.start():match.end()] == expected_match
+
+# test that we get the default prompt with a None match object for a
+# file we don't recognize
+def test_resolve_default_prompt_unknown_file():
+    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+    editor = ManuscriptEditor(content_dir)
+
+    prompt, match = editor.prompt_config.get_prompt_for_filename("some-unknown-file.md")
+
+    assert prompt.strip() == """default prompt text"""
+    assert match is None
+
+# check that a file we don't recognize gets match==None and the 'default' prompt
+# from the ai_revision-config.yaml file
+def test_unresolved_gets_default_prompt():
+    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+    editor = ManuscriptEditor(content_dir)
+    prompt, match = editor.prompt_config.get_prompt_for_filename("crazy-filename")
+
+    assert isinstance(prompt, str)
+    assert match is None
+
+    assert prompt.strip() == """default prompt text"""
+
+# ==============================================================================
+# === prompts_files tests, using ai_revision-prompts.yaml w/ai_revision-config.yaml to process ignores, defaults
+# ==============================================================================
+
+# TBC
\ No newline at end of file

From d513eef28a0319701a1ef111dd2189e8973a7033 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 31 Jan 2024 13:33:37 -0700
Subject: [PATCH 04/44] get_prompt_for_filename() now returns "__IGNORE_FILE__"
 for the prompt if it's explicitly ignored. revise_manuscript() now checks for
 this sentinel and ignores the file.

---
 libs/manubot_ai_editor/editor.py        | 12 +++++++++---
 libs/manubot_ai_editor/prompt_config.py | 12 ++++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/libs/manubot_ai_editor/editor.py b/libs/manubot_ai_editor/editor.py
index b5f6627..fb4ef3f 100644
--- a/libs/manubot_ai_editor/editor.py
+++ b/libs/manubot_ai_editor/editor.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 from manubot_ai_editor import env_vars
-from manubot_ai_editor.prompt_config import ManuscriptPromptConfig
+from manubot_ai_editor.prompt_config import ManuscriptPromptConfig, IGNORE_FILE
 from manubot_ai_editor.models import ManuscriptRevisionModel
 from manubot_ai_editor.utils import (
     get_yaml_field,
@@ -467,11 +467,17 @@ def revise_manuscript(
             # use the ai_revision prompt config to attempt to resolve a prompt
             resolved_prompt, _ = self.prompt_config.get_prompt_for_filename(filename.name)
 
+            # ignore the file if the ai_revision-* config files told us to
+            if resolved_prompt == IGNORE_FILE:
+                continue
+
             # we do not process the file if all hold:
-            # 1. it has no section
+            # 1. it has no section *or* resolved prompt
             # 2. we're unable to resolve it via ai_revision prompt configuration
             # 2. there is no custom prompt
-            if filename_section is None and (
+            if (
+                filename_section is None and resolved_prompt is None
+            ) and (
                 env_vars.CUSTOM_PROMPT not in os.environ
                 or os.environ[env_vars.CUSTOM_PROMPT].strip() == ""
             ):
diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
index 56a9119..98c4be2 100644
--- a/libs/manubot_ai_editor/prompt_config.py
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -11,6 +11,10 @@ class ManuscriptPromptConfig.
 
 from manubot_ai_editor.utils import get_obj_path
 
+# if returned as the prompt from get_prompt_for_filename() then the file should
+# be ignored
+IGNORE_FILE = "__IGNORE_FILE__"
+
 class ManuscriptConfigException(Exception):
     """
     Parent class for exceptions raised by ManuscriptConfig's loading process.
@@ -85,8 +89,8 @@ def _load_custom_prompts(self) -> (dict, dict):
                 'The "ai_revision-config.yaml" YAML file must exist if "ai_revision-prompts.yaml" begins with the "prompts" key.'
             )
 
-        prompts = data.get('prompts', {})
-        prompts_files = data.get('prompts_files', {})
+        prompts = data.get('prompts')
+        prompts_files = data.get('prompts_files')
 
         return (prompts, prompts_files)
 
@@ -111,7 +115,7 @@ def get_prompt_for_filename(self, filename: str, use_default: bool = True) -> (O
         # first, check the ignore list to see if we should bail early
         for ignore in get_obj_path(self.config, ('files', 'ignore'), missing=[]):
             if (m := re.search(ignore, filename)):
-                return (None, m)
+                return (IGNORE_FILE, m)
 
         # FIXME: which takes priority, the files collection in ai_revision-config.yaml
         #  or the prompt_file? we went with config taking precendence for now
@@ -135,7 +139,7 @@ def get_prompt_for_filename(self, filename: str, use_default: bool = True) -> (O
         if self.prompts_files:
             for pattern, prompt in self.prompts_files.items():
                 if (m := re.search(pattern, filename)):
-                    return (prompt, m)
+                    return (prompt if prompt is not None else IGNORE_FILE, m)
 
         # finally, return the default prompt
         return (

From 570499850f9fce5423385df88905adfb619b8242 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 31 Jan 2024 13:34:24 -0700
Subject: [PATCH 05/44] Adds testing utility mock_unify_open(), which allows us
 to merge manuscript folders and config folders temporarily for testing. Adds
 set_directory() helper to temporarily switch folders.

---
 tests/utils/__init__.py                       |  0
 tests/utils/dir_union.py                      | 54 +++++++++++++++++++
 .../dir_union_fixtures/original/test.txt      |  1 +
 .../dir_union_fixtures/patched/another.txt    |  1 +
 .../dir_union_fixtures/patched/sub/third.txt  |  1 +
 tests/utils/test_dir_union.py                 | 32 +++++++++++
 6 files changed, 89 insertions(+)
 create mode 100644 tests/utils/__init__.py
 create mode 100644 tests/utils/dir_union.py
 create mode 100644 tests/utils/dir_union_fixtures/original/test.txt
 create mode 100644 tests/utils/dir_union_fixtures/patched/another.txt
 create mode 100644 tests/utils/dir_union_fixtures/patched/sub/third.txt
 create mode 100644 tests/utils/test_dir_union.py

diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/utils/dir_union.py b/tests/utils/dir_union.py
new file mode 100644
index 0000000..bb513a6
--- /dev/null
+++ b/tests/utils/dir_union.py
@@ -0,0 +1,54 @@
+import os
+from pathlib import Path
+from unittest import mock
+
+from contextlib import contextmanager
+
+@contextmanager
+def set_directory(new):
+    """
+    Given a path, sets it as the current working directory,
+    then sets it back once the context has been exited.
+
+    Note that if we upgrade to Python 3.11, this method can be replaced
+    with https://docs.python.org/3/library/contextlib.html#contextlib.chdir
+    """
+    
+    # store the current path so we can return to it
+    original = Path().absolute()
+
+    try:
+        os.chdir(new)
+        yield
+    finally:
+        os.chdir(original)
+
+def mock_unify_open(original, patched):
+    """
+    Given paths to an 'original' and 'patched' folder,
+    patches open() to first check the patched folder for the
+    target file, then checks the original folder if it's not found
+    in the patched folder.
+    """
+    builtin_open = open
+
+    def unify_open(*args, **kwargs):
+        try:
+            # first, try to open the file from within patched
+
+            # resolve all paths: the original, patched, and requested file
+            target_full_path = Path(args[0]).absolute()
+            rewritten_path = (
+                str(target_full_path)
+                    .replace(
+                        str(original.absolute()),
+                        str(patched.absolute())
+                    )
+            )
+
+            return builtin_open(rewritten_path, *(args[1:]), **kwargs)
+        except FileNotFoundError:
+            # resort to opening it normally
+            return builtin_open(*args, **kwargs)
+
+    return unify_open
diff --git a/tests/utils/dir_union_fixtures/original/test.txt b/tests/utils/dir_union_fixtures/original/test.txt
new file mode 100644
index 0000000..30f51a3
--- /dev/null
+++ b/tests/utils/dir_union_fixtures/original/test.txt
@@ -0,0 +1 @@
+hello, world!
\ No newline at end of file
diff --git a/tests/utils/dir_union_fixtures/patched/another.txt b/tests/utils/dir_union_fixtures/patched/another.txt
new file mode 100644
index 0000000..e291c2b
--- /dev/null
+++ b/tests/utils/dir_union_fixtures/patched/another.txt
@@ -0,0 +1 @@
+patched in via unify mock
\ No newline at end of file
diff --git a/tests/utils/dir_union_fixtures/patched/sub/third.txt b/tests/utils/dir_union_fixtures/patched/sub/third.txt
new file mode 100644
index 0000000..6f13b2d
--- /dev/null
+++ b/tests/utils/dir_union_fixtures/patched/sub/third.txt
@@ -0,0 +1 @@
+a third file
\ No newline at end of file
diff --git a/tests/utils/test_dir_union.py b/tests/utils/test_dir_union.py
new file mode 100644
index 0000000..0a21d46
--- /dev/null
+++ b/tests/utils/test_dir_union.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+from unittest import mock
+
+from .dir_union import mock_unify_open, set_directory
+
+# tests for mock_unify_open
+
+UNIFY_TEST_DIR = Path(__file__).parent / "dir_union_fixtures"
+UNIFY_ORIG_DIR = UNIFY_TEST_DIR / "original"
+UNIFY_PATCHED_DIR = UNIFY_TEST_DIR / "patched"
+
+@mock.patch("builtins.open", mock_unify_open(UNIFY_ORIG_DIR, UNIFY_PATCHED_DIR))
+def test_unify_folder_mock():
+    # test that we can still open files in the original folder
+    with open(UNIFY_ORIG_DIR / "test.txt") as fp:
+        assert fp.read().strip() == "hello, world!"
+    # test that the patched folder takes precedence
+    with open(UNIFY_ORIG_DIR / "another.txt") as fp:
+        assert fp.read().strip() == "patched in via unify mock"
+
+@mock.patch("builtins.open", mock_unify_open(UNIFY_ORIG_DIR, UNIFY_PATCHED_DIR))
+def test_unify_folder_mock_relative_paths():
+    with set_directory(UNIFY_ORIG_DIR):
+        # test that we can still open files in the original folder
+        with open("./test.txt") as fp:
+            assert fp.read().strip() == "hello, world!"
+        # test that the patched folder takes precedence
+        with open("./another.txt") as fp:
+            assert fp.read().strip() == "patched in via unify mock"
+        # test that subfolders in the patched folder can be used
+        with open("./sub/third.txt") as fp:
+            assert fp.read().strip() == "a third file"

From 68ac05e292ae5218e09d1911ef504ea952add0fd Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 31 Jan 2024 13:35:31 -0700
Subject: [PATCH 06/44] Adds tests for four scenarios mentioned in issue 31.
 Adds ignore test on the phenoplier full manuscript for the "both files"
 scenario.

---
 .../ai_revision-config.yaml                   |  25 ++++
 .../ai_revision-prompts.yaml                  |  40 ++++++
 .../ai_revision-prompts.yaml                  |  42 +++++++
 .../phenoplier_full/ai_revision-config.yaml   |   0
 .../phenoplier_full/ai_revision-prompts.yaml  |   0
 .../ai_revision-config.yaml                   |   5 +
 .../ai_revision-prompts.yaml                  |   3 +
 tests/test_prompt_config.py                   | 115 ++++++++++++++++--
 8 files changed, 220 insertions(+), 10 deletions(-)
 create mode 100644 tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml
 create mode 100644 tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
 create mode 100644 tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml
 rename tests/{manuscripts => config_loader_fixtures}/phenoplier_full/ai_revision-config.yaml (100%)
 rename tests/{manuscripts => config_loader_fixtures}/phenoplier_full/ai_revision-prompts.yaml (100%)
 create mode 100644 tests/config_loader_fixtures/single_generic_prompt/ai_revision-config.yaml
 create mode 100644 tests/config_loader_fixtures/single_generic_prompt/ai_revision-prompts.yaml

diff --git a/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml b/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml
new file mode 100644
index 0000000..dd71dcd
--- /dev/null
+++ b/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml
@@ -0,0 +1,25 @@
+files:
+  matchings:
+    - files:
+        - abstract
+      prompt: abstract
+    - files:
+        - introduction
+      prompt: introduction_discussion
+    - files:
+        - 04\..+\.md
+      prompt: results
+    - files:
+        - discussion
+      prompt: introduction_discussion
+    - files:
+        - methods
+      prompt: methods
+  
+  default_prompt: This is the default prompt
+  
+  ignore:
+    - front-matter
+    - acknowledgements
+    - supplementary_material
+    - references
diff --git a/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml b/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
new file mode 100644
index 0000000..281e2ab
--- /dev/null
+++ b/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
@@ -0,0 +1,40 @@
+prompts:
+  abstract: |
+    Revise the following paragraph from the Abstract of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      the research problem/question is clear,
+      the solution proposed is clear,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text is in active voice and has a clear sentence structure
+
+  introduction_discussion: |
+    Revise the following paragraph from the {file.section.capitalize()} of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      the research problem/question is clear,
+      the solution proposed is clear,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text is in active voice and has a clear sentence structure
+
+  results: |
+    Revise the following paragraph from the Results section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      most references to figures and tables are kept,
+      the details are enough to clearly explain the outcomes,
+      sentences are concise and to the point,
+      the text minimizes the use of jargon,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text has a clear sentence structure
+
+  methods: |
+    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+       most of the citations to other academic papers are kept,
+       most of the technical details are kept,
+       most references to equations (such as "Equation (@id)") are kept,
+       all equations definitions (such as '*equation_definition') are included with newlines before and after,
+       the most important symbols in equations are defined,
+       the text grammar is correct,
+       spelling errors are fixed,
+       and the text has a clear sentence structure
+
+  default: |
+    Proofread the following paragraph
diff --git a/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml b/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml
new file mode 100644
index 0000000..dde172e
--- /dev/null
+++ b/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml
@@ -0,0 +1,42 @@
+prompts_files:
+  abstract: |
+    Revise the following paragraph from the Abstract of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      the research problem/question is clear,
+      the solution proposed is clear,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text is in active voice and has a clear sentence structure
+
+  introduction|discussion: |
+    Revise the following paragraph from the {file.section.capitalize()} of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      the research problem/question is clear,
+      the solution proposed is clear,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text is in active voice and has a clear sentence structure
+
+  results: |
+    Revise the following paragraph from the Results section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      most references to figures and tables are kept,
+      the details are enough to clearly explain the outcomes,
+      sentences are concise and to the point,
+      the text minimizes the use of jargon,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text has a clear sentence structure
+
+  methods: |
+    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+       most of the citations to other academic papers are kept,
+       most of the technical details are kept,
+       most references to equations (such as "Equation (@id)") are kept,
+       all equations definitions (such as '*equation_definition') are included with newlines before and after,
+       the most important symbols in equations are defined,
+       the text grammar is correct,
+       spelling errors are fixed,
+       and the text has a clear sentence structure
+
+  references: null
+
+  \.md$: |
+    Proofread the following paragraph
diff --git a/tests/manuscripts/phenoplier_full/ai_revision-config.yaml b/tests/config_loader_fixtures/phenoplier_full/ai_revision-config.yaml
similarity index 100%
rename from tests/manuscripts/phenoplier_full/ai_revision-config.yaml
rename to tests/config_loader_fixtures/phenoplier_full/ai_revision-config.yaml
diff --git a/tests/manuscripts/phenoplier_full/ai_revision-prompts.yaml b/tests/config_loader_fixtures/phenoplier_full/ai_revision-prompts.yaml
similarity index 100%
rename from tests/manuscripts/phenoplier_full/ai_revision-prompts.yaml
rename to tests/config_loader_fixtures/phenoplier_full/ai_revision-prompts.yaml
diff --git a/tests/config_loader_fixtures/single_generic_prompt/ai_revision-config.yaml b/tests/config_loader_fixtures/single_generic_prompt/ai_revision-config.yaml
new file mode 100644
index 0000000..3ef6a49
--- /dev/null
+++ b/tests/config_loader_fixtures/single_generic_prompt/ai_revision-config.yaml
@@ -0,0 +1,5 @@
+files:
+  ignore:
+    - front\-matter
+    - back\-matter
+    - response\-to\-reviewers
diff --git a/tests/config_loader_fixtures/single_generic_prompt/ai_revision-prompts.yaml b/tests/config_loader_fixtures/single_generic_prompt/ai_revision-prompts.yaml
new file mode 100644
index 0000000..bab7fca
--- /dev/null
+++ b/tests/config_loader_fixtures/single_generic_prompt/ai_revision-prompts.yaml
@@ -0,0 +1,3 @@
+prompts_files:
+  \.md$: |
+    Proofread the following paragraph
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index db38ada..c3d7c25 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -1,14 +1,13 @@
 
-import os
-import re
 from pathlib import Path
 from unittest import mock
 
+from manubot_ai_editor.editor import ManuscriptEditor
+from manubot_ai_editor.models import GPT3CompletionModel, RandomManuscriptRevisionModel
+from manubot_ai_editor.prompt_config import IGNORE_FILE
 import pytest
 
-from manubot_ai_editor.editor import ManuscriptEditor, env_vars
-from manubot_ai_editor import models
-from manubot_ai_editor.models import GPT3CompletionModel, RandomManuscriptRevisionModel
+from utils.dir_union import mock_unify_open
 
 MANUSCRIPTS_DIR = Path(__file__).parent / "manuscripts" / "phenoplier_full"
 
@@ -27,15 +26,20 @@ def test_create_manuscript_editor():
 # ==============================================================================
 # === prompts tests, using ai_revision-config.yaml + ai_revision-prompts.yaml
 # ==============================================================================
+    
+# contains standard prompt, config files for phenoplier_full
+# (this is merged into the manuscript folder using the mock_unify_open mock)
+PHENOPLIER_PROMPTS_DIR = Path(__file__).parent / "config_loader_fixtures" / "phenoplier_full"
 
 # check that we can resolve a file to a prompt, and that it's the correct prompt
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PHENOPLIER_PROMPTS_DIR))
 def test_resolve_prompt():
     content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
     editor = ManuscriptEditor(content_dir)
 
     phenoplier_files_matches = {
         # explicitly ignored in ai_revision-config.yaml
-        '00.front-matter.md': (None, 'front-matter'),
+        '00.front-matter.md': (IGNORE_FILE, 'front-matter'),
 
         # prompts that match a part of the filename
         '01.abstract.md': ('Test match abstract.\n', 'abstract'),
@@ -53,9 +57,9 @@ def test_resolve_prompt():
         '07.00.methods.md': ('Test match methods.\n', 'methods'),
 
         # these are all explicitly ignored in ai_revision-config.yaml
-        '10.references.md': (None, 'references'),
-        '15.acknowledgements.md': (None, 'acknowledgements'),
-        '50.00.supplementary_material.md': (None, 'supplementary_material')
+        '10.references.md': (IGNORE_FILE, 'references'),
+        '15.acknowledgements.md': (IGNORE_FILE, 'acknowledgements'),
+        '50.00.supplementary_material.md': (IGNORE_FILE, 'supplementary_material')
     }
 
     for filename, (expected_prompt, expected_match) in phenoplier_files_matches.items():
@@ -76,6 +80,7 @@ def test_resolve_prompt():
 
 # test that we get the default prompt with a None match object for a
 # file we don't recognize
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PHENOPLIER_PROMPTS_DIR))
 def test_resolve_default_prompt_unknown_file():
     content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
     editor = ManuscriptEditor(content_dir)
@@ -87,6 +92,7 @@ def test_resolve_default_prompt_unknown_file():
 
 # check that a file we don't recognize gets match==None and the 'default' prompt
 # from the ai_revision-config.yaml file
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PHENOPLIER_PROMPTS_DIR))
 def test_unresolved_gets_default_prompt():
     content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
     editor = ManuscriptEditor(content_dir)
@@ -101,4 +107,93 @@ def test_unresolved_gets_default_prompt():
 # === prompts_files tests, using ai_revision-prompts.yaml w/ai_revision-config.yaml to process ignores, defaults
 # ==============================================================================
 
-# TBC
\ No newline at end of file
+# the following tests are derived from examples in
+# https://github.com/manubot/manubot-ai-editor/issues/31
+# we test four different scenarios from ./config_loader_fixtures:
+# - Only ai_revision-prompts.yaml is defined (only_revision_prompts)
+ONLY_REV_PROMPTS_DIR = Path(__file__).parent / "config_loader_fixtures" / "only_revision_prompts"
+# - Both ai_revision-prompts.yaml and ai_revision-config.yaml are defined (both_prompts_config)
+BOTH_PROMPTS_CONFIG_DIR = Path(__file__).parent / "config_loader_fixtures" / "both_prompts_config"
+# - Only a single, generic prompt is defined (single_generic_prompt)
+SINGLE_GENERIC_PROMPT_DIR = Path(__file__).parent / "config_loader_fixtures" / "single_generic_prompt"
+
+# ---
+# test ManuscriptEditor.prompt_config sub-attributes are set correctly
+# ---
+
+def get_editor():
+    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+    editor = ManuscriptEditor(content_dir)
+    assert isinstance(editor, ManuscriptEditor)
+    return editor
+
+def test_no_config_unloaded():
+    """
+    With no config files defined, the ManuscriptPromptConfig object should
+    have its attributes set to None.
+    """
+    editor = get_editor()
+    
+    # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
+    assert editor.prompt_config.prompts is None
+    assert editor.prompt_config.prompts_files is None
+    assert editor.prompt_config.config is None
+
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, ONLY_REV_PROMPTS_DIR))
+def test_only_rev_prompts_loaded():
+    editor = get_editor()
+
+    # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
+    assert editor.prompt_config.prompts is  None
+    assert editor.prompt_config.prompts_files is not None
+    assert editor.prompt_config.config is None
+
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, BOTH_PROMPTS_CONFIG_DIR))
+def test_both_prompts_loaded():
+    editor = get_editor()
+
+    # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
+    assert editor.prompt_config.prompts is not None
+    assert editor.prompt_config.prompts_files is None
+    assert editor.prompt_config.config is not None
+
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, SINGLE_GENERIC_PROMPT_DIR))
+def test_single_generic_loaded():
+    editor = get_editor()
+
+    # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
+    assert editor.prompt_config.prompts is None
+    assert editor.prompt_config.prompts_files is not None
+    assert editor.prompt_config.config is not None
+
+# ---
+# test that ignored files are ignored in applicable scenarios
+# ---
+    
+# places in configs where files can be ignored:
+# ai_revision-config.yaml: the `files.ignore` key
+# ai_revision-prompts.yaml: when a prompt in `prompts_files` has a value of null
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        RandomManuscriptRevisionModel(),
+        GPT3CompletionModel(None, None),
+    ],
+)
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, BOTH_PROMPTS_CONFIG_DIR))
+def test_revise_entire_manuscript(tmp_path, model):
+    print(f"\n{str(tmp_path)}\n")
+    me = get_editor()
+
+    model.title = me.title
+    model.keywords = me.keywords
+
+    output_folder = tmp_path
+    assert output_folder.exists()
+
+    me.revise_manuscript(output_folder, model)
+
+    # after processing ignores, we should be left with 9 files from the original 12
+    output_md_files = list(output_folder.glob("*.md"))
+    assert len(output_md_files) == 9

From de5b3c92bb1c09474155477347ed8771f84f24d3 Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@users.noreply.github.com>
Date: Fri, 2 Feb 2024 14:42:23 -0700
Subject: [PATCH 07/44] add pre-commit config

---
 .pre-commit-config.yaml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d789ed9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,29 @@
+default_language_version:
+  python: python3.10
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      # Check for files that contain merge conflict strings.
+      - id: check-merge-conflict
+      # Check for debugger imports and py37+ `breakpoint()` calls in python source.
+      - id: debug-statements
+      # Replaces or checks mixed line ending
+      - id: mixed-line-ending
+      # Check for files that would conflict in case-insensitive filesystems
+      - id: check-case-conflict
+      # This hook checks toml files for parseable syntax.
+      - id: check-toml
+      # This hook checks yaml files for parseable syntax.
+      - id: check-yaml
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.2.0
+    hooks:
+      - id: ruff
+        args:
+        - --fix
+  - repo: https://github.com/python/black
+    rev: 24.1.1
+    hooks:
+      - id: black
+        language_version: python3

From 49ab292bd7df9de5b47722099a85e1cc98a7e2f7 Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@users.noreply.github.com>
Date: Fri, 2 Feb 2024 14:49:51 -0700
Subject: [PATCH 08/44] fix python code style

---
 libs/manubot_ai_editor/editor.py        | 32 +++++----
 libs/manubot_ai_editor/models.py        | 14 ++--
 libs/manubot_ai_editor/prompt_config.py | 50 +++++++++-----
 libs/manubot_ai_editor/utils.py         |  3 +-
 tests/conftest.py                       | 11 +++-
 tests/test_model_revision.py            |  2 +-
 tests/test_prompt_config.py             | 86 +++++++++++++++++--------
 tests/utils/dir_union.py                | 13 ++--
 tests/utils/test_dir_union.py           |  2 +
 9 files changed, 136 insertions(+), 77 deletions(-)

diff --git a/libs/manubot_ai_editor/editor.py b/libs/manubot_ai_editor/editor.py
index fb4ef3f..a445dbf 100644
--- a/libs/manubot_ai_editor/editor.py
+++ b/libs/manubot_ai_editor/editor.py
@@ -29,9 +29,7 @@ def __init__(self, content_dir: str | Path):
         self.keywords = get_yaml_field(metadata_file, "keywords")
 
         self.prompt_config = ManuscriptPromptConfig(
-            content_dir=content_dir,
-            title=self.title,
-            keywords=self.keywords
+            content_dir=content_dir, title=self.title, keywords=self.keywords
         )
 
     @staticmethod
@@ -123,9 +121,7 @@ def revise_and_write_paragraph(
         error_message = None
         try:
             paragraph_revised = revision_model.revise_paragraph(
-                paragraph_text,
-                section_name,
-                resolved_prompt=resolved_prompt
+                paragraph_text, section_name, resolved_prompt=resolved_prompt
             )
 
             if paragraph_revised.strip() == "":
@@ -258,7 +254,7 @@ def revise_file(
         output_dir: Path | str,
         revision_model: ManuscriptRevisionModel,
         section_name: str = None,
-        resolved_prompt: str = None
+        resolved_prompt: str = None,
     ):
         """
         It revises an entire Markdown file and writes the revised file to the output directory.
@@ -388,7 +384,11 @@ def revise_file(
 
                     # revise and write paragraph to output file
                     self.revise_and_write_paragraph(
-                        paragraph, revision_model, section_name, resolved_prompt=resolved_prompt, outfile=outfile
+                        paragraph,
+                        revision_model,
+                        section_name,
+                        resolved_prompt=resolved_prompt,
+                        outfile=outfile,
                     )
 
                     # clear the paragraph list
@@ -430,7 +430,11 @@ def revise_file(
             # output file
             if paragraph:
                 self.revise_and_write_paragraph(
-                    paragraph, revision_model, section_name, resolved_prompt=None, outfile=outfile
+                    paragraph,
+                    revision_model,
+                    section_name,
+                    resolved_prompt=None,
+                    outfile=outfile,
                 )
 
     def revise_manuscript(
@@ -465,7 +469,9 @@ def revise_manuscript(
             filename_section = self.get_section_from_filename(filename.name)
 
             # use the ai_revision prompt config to attempt to resolve a prompt
-            resolved_prompt, _ = self.prompt_config.get_prompt_for_filename(filename.name)
+            resolved_prompt, _ = self.prompt_config.get_prompt_for_filename(
+                filename.name
+            )
 
             # ignore the file if the ai-revision_* config files told us to
             if resolved_prompt == IGNORE_FILE:
@@ -475,9 +481,7 @@ def revise_manuscript(
             # 1. it has no section *or* resolved prompt
             # 2. we're unable to resolve it via ai_revision prompt configuration
             # 2. there is no custom prompt
-            if (
-                filename_section is None and resolved_prompt is None
-            ) and (
+            if (filename_section is None and resolved_prompt is None) and (
                 env_vars.CUSTOM_PROMPT not in os.environ
                 or os.environ[env_vars.CUSTOM_PROMPT].strip() == ""
             ):
@@ -496,5 +500,5 @@ def revise_manuscript(
                 output_dir,
                 revision_model,
                 section_name=filename_section,
-                resolved_prompt=resolved_prompt
+                resolved_prompt=resolved_prompt,
             )
diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 3965caa..9bb1bf2 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -30,7 +30,7 @@ def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         Returns:
             Revised paragraph text.
         """
-        raise NotImplemented
+        raise NotImplementedError
 
     @abstractmethod
     def get_prompt(self, paragraph_text, section_name):
@@ -38,7 +38,7 @@ def get_prompt(self, paragraph_text, section_name):
         Returns the prompt to be used for the revision of a paragraph that
         belongs to a given section.
         """
-        raise NotImplemented
+        raise NotImplementedError
 
 
 class DummyManuscriptRevisionModel(ManuscriptRevisionModel):
@@ -90,7 +90,9 @@ def __init__(self):
         super().__init__()
         self.sentence_end_pattern = re.compile(r"\n")
 
-    def revise_paragraph(self, paragraph_text: str, section_name: str, resolved_prompt=None) -> str:
+    def revise_paragraph(
+        self, paragraph_text: str, section_name: str, resolved_prompt=None
+    ) -> str:
         """
         It takes each sentence of the paragraph and randomizes the words.
         """
@@ -221,7 +223,9 @@ def __init__(
         # adjust options if edits or chat endpoint was selected
         self.endpoint = "chat"
 
-        if model_engine.startswith(("text-davinci-", "text-curie-", "text-babbage-", "text-ada-")):
+        if model_engine.startswith(
+            ("text-davinci-", "text-curie-", "text-babbage-", "text-ada-")
+        ):
             self.endpoint = "completions"
 
             if "-edit-" in model_engine:
@@ -289,7 +293,7 @@ def get_prompt(
         #    used.
 
         custom_prompt = None
-        if ((c := os.environ.get(env_vars.CUSTOM_PROMPT, "").strip()) and c != ""):
+        if (c := os.environ.get(env_vars.CUSTOM_PROMPT, "").strip()) and c != "":
             custom_prompt = c
             print(
                 f"Using custom prompt from environment variable '{env_vars.CUSTOM_PROMPT}'"
diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
index 98c4be2..b463359 100644
--- a/libs/manubot_ai_editor/prompt_config.py
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -15,12 +15,15 @@ class ManuscriptPromptConfig.
 # be ignored
 IGNORE_FILE = "__IGNORE_FILE__"
 
+
 class ManuscriptConfigException(Exception):
     """
     Parent class for exceptions raised by ManuscriptConfig's loading process.
     """
+
     pass
 
+
 class ManuscriptPromptConfig:
     """
     Loads configuration from two YAML files in 'content_dir':
@@ -34,6 +37,7 @@ class ManuscriptPromptConfig:
     which uses both the 'ai_revision-prompts.yaml' and 'ai_revision-config.yaml'
     files to determine the prompt for a given filename.
     """
+
     def __init__(self, content_dir: str, title: str, keywords: str) -> None:
         self.content_dir = Path(content_dir)
         self.config = self._load_config()
@@ -47,7 +51,7 @@ def _load_config(self) -> dict:
         """
         Loads general configuration from ai_revision-config.yaml
         """
-        
+
         config_file_path = os.path.join(self.content_dir, "ai_revision-config.yaml")
 
         try:
@@ -56,7 +60,6 @@ def _load_config(self) -> dict:
         except FileNotFoundError:
             return None
 
-        
     def _load_custom_prompts(self) -> (dict, dict):
         """
         Loads custom prompts from ai_revision-prompts.yaml. The file
@@ -79,22 +82,26 @@ def _load_custom_prompts(self) -> (dict, dict):
             return (None, None)
 
         # validate the existence of at least one of the keys we require
-        if 'prompts' not in data and 'prompts_files' not in data:
-            raise ManuscriptConfigException('The "ai_revision-prompts.yaml" YAML file must contain a "prompts" or a "prompts_files" key.')
+        if "prompts" not in data and "prompts_files" not in data:
+            raise ManuscriptConfigException(
+                'The "ai_revision-prompts.yaml" YAML file must contain a "prompts" or a "prompts_files" key.'
+            )
 
         # if the top-level key was 'prompts', that implies we need the `ai_revision-config.yaml`
         # file to match those prompts to filenames, so raise an exception if it doesn't exist
-        if 'prompts' in data and not self.config:
+        if "prompts" in data and not self.config:
             raise ManuscriptConfigException(
                 'The "ai_revision-config.yaml" YAML file must exist if "ai_revision-prompts.yaml" begins with the "prompts" key.'
             )
 
-        prompts = data.get('prompts')
-        prompts_files = data.get('prompts_files')
+        prompts = data.get("prompts")
+        prompts_files = data.get("prompts_files")
 
         return (prompts, prompts_files)
 
-    def get_prompt_for_filename(self, filename: str, use_default: bool = True) -> (Optional[str], Optional[re.Match]):
+    def get_prompt_for_filename(
+        self, filename: str, use_default: bool = True
+    ) -> (Optional[str], Optional[re.Match]):
         """
         Retrieves the prompt for a given filename. It checks the following sources
         for a match in order:
@@ -113,8 +120,8 @@ def get_prompt_for_filename(self, filename: str, use_default: bool = True) -> (O
         """
 
         # first, check the ignore list to see if we should bail early
-        for ignore in get_obj_path(self.config, ('files', 'ignore'), missing=[]):
-            if (m := re.search(ignore, filename)):
+        for ignore in get_obj_path(self.config, ("files", "ignore"), missing=[]):
+            if m := re.search(ignore, filename):
                 return (IGNORE_FILE, m)
 
         # FIXME: which takes priority, the files collection in ai_revision-config.yaml
@@ -122,27 +129,36 @@ def get_prompt_for_filename(self, filename: str, use_default: bool = True) -> (O
 
         # then, consult ai_revision-config.yaml's 'matchings' collection if a
         # match is found, use the prompt ai_revision-prompts.yaml
-        for entry in get_obj_path(self.config, ('files', 'matchings'), missing=[]):
+        for entry in get_obj_path(self.config, ("files", "matchings"), missing=[]):
             # iterate through all the 'matchings' entries, trying to find one
             # that matches the current filename
-            for pattern in entry['files']:
-                if (m := re.search(pattern, filename)):
+            for pattern in entry["files"]:
+                if m := re.search(pattern, filename):
                     # since we matched, use the 'prompts' collection to return a
                     # named prompt corresponding to the one from the 'matchings'
                     # collection
                     return (
-                        self.prompts.get(entry['prompt'], None) if self.prompts else None, m
+                        (
+                            self.prompts.get(entry["prompt"], None)
+                            if self.prompts
+                            else None
+                        ),
+                        m,
                     )
 
         # since we haven't found a match yet, consult ai_revision-prompts.yaml's
         # 'prompts_files' collection
         if self.prompts_files:
             for pattern, prompt in self.prompts_files.items():
-                if (m := re.search(pattern, filename)):
+                if m := re.search(pattern, filename):
                     return (prompt if prompt is not None else IGNORE_FILE, m)
 
         # finally, return the default prompt
         return (
-            get_obj_path(self.config, ('files', 'default_prompt')) if use_default else None,
-            None
+            (
+                get_obj_path(self.config, ("files", "default_prompt"))
+                if use_default
+                else None
+            ),
+            None,
         )
diff --git a/libs/manubot_ai_editor/utils.py b/libs/manubot_ai_editor/utils.py
index 1873109..1cee6c6 100644
--- a/libs/manubot_ai_editor/utils.py
+++ b/libs/manubot_ai_editor/utils.py
@@ -25,6 +25,7 @@ def starts_with_similar(string: str, prefix: str, threshold: float = 0.8) -> boo
         difflib.SequenceMatcher(None, prefix, string[: len(prefix)]).ratio() > threshold
     )
 
+
 def get_obj_path(target: any, path: tuple, missing=None):
     """
     Traverse a nested object using a tuple of keys, returning the last resolved
@@ -48,5 +49,5 @@ def get_obj_path(target: any, path: tuple, missing=None):
             target = target[key]
     except (KeyError, IndexError, TypeError):
         return missing
-        
+
     return target
diff --git a/tests/conftest.py b/tests/conftest.py
index f5d2209..01bfc60 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,19 +10,24 @@
 
 def pytest_addoption(parser):
     parser.addoption(
-        "--runcost", action="store_true", default=False, help="run tests that can incur API usage costs"
+        "--runcost",
+        action="store_true",
+        default=False,
+        help="run tests that can incur API usage costs",
     )
 
 
 def pytest_configure(config):
-    config.addinivalue_line("markers", "cost: mark test as possibly costing money to run")
+    config.addinivalue_line(
+        "markers", "cost: mark test as possibly costing money to run"
+    )
 
 
 def pytest_collection_modifyitems(config, items):
     if config.getoption("--runcost"):
         # --runcost given in cli: do not skip cost tests
         return
-    
+
     skip_cost = pytest.mark.skip(reason="need --runcost option to run")
 
     for item in items:
diff --git a/tests/test_model_revision.py b/tests/test_model_revision.py
index 942174b..e9ff26f 100644
--- a/tests/test_model_revision.py
+++ b/tests/test_model_revision.py
@@ -1,7 +1,7 @@
 """
 These tests need to call the OpenAI API, so they are in a separate file and can incur costs.
 """
-import difflib
+
 from unittest import mock
 
 import pytest
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index c3d7c25..5b4863e 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -1,4 +1,3 @@
-
 from pathlib import Path
 from unittest import mock
 
@@ -11,11 +10,13 @@
 
 MANUSCRIPTS_DIR = Path(__file__).parent / "manuscripts" / "phenoplier_full"
 
+
 # check that this path exists and resolve it
 def test_manuscripts_dir_exists():
     content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
     assert content_dir.exists()
 
+
 # check that we can create a ManuscriptEditor object
 def test_create_manuscript_editor():
     content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
@@ -26,10 +27,13 @@ def test_create_manuscript_editor():
 # ==============================================================================
 # === prompts tests, using ai_revision-config.yaml + ai_revision-prompts.yaml
 # ==============================================================================
-    
+
 # contains standard prompt, config files for phenoplier_full
 # (this is merged into the manuscript folder using the mock_unify_open mock)
-PHENOPLIER_PROMPTS_DIR = Path(__file__).parent / "config_loader_fixtures" / "phenoplier_full"
+PHENOPLIER_PROMPTS_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "phenoplier_full"
+)
+
 
 # check that we can resolve a file to a prompt, and that it's the correct prompt
 @mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PHENOPLIER_PROMPTS_DIR))
@@ -39,27 +43,35 @@ def test_resolve_prompt():
 
     phenoplier_files_matches = {
         # explicitly ignored in ai_revision-config.yaml
-        '00.front-matter.md': (IGNORE_FILE, 'front-matter'),
-
+        "00.front-matter.md": (IGNORE_FILE, "front-matter"),
         # prompts that match a part of the filename
-        '01.abstract.md': ('Test match abstract.\n', 'abstract'),
-        '02.introduction.md': ('Test match introduction or discussion.\n', 'introduction'),
-
+        "01.abstract.md": ("Test match abstract.\n", "abstract"),
+        "02.introduction.md": (
+            "Test match introduction or discussion.\n",
+            "introduction",
+        ),
         # these all match the regex 04\..+\.md, hence why the match object includes a suffix
-        '04.00.results.md': ('Test match results.\n', '04.00.results.md'),
-        '04.05.00.results_framework.md': ('Test match results.\n', '04.05.00.results_framework.md'),
-        '04.05.01.crispr.md': ('Test match results.\n', '04.05.01.crispr.md'),
-        '04.15.drug_disease_prediction.md': ('Test match results.\n', '04.15.drug_disease_prediction.md'),
-        '04.20.00.traits_clustering.md': ('Test match results.\n', '04.20.00.traits_clustering.md'),
-
+        "04.00.results.md": ("Test match results.\n", "04.00.results.md"),
+        "04.05.00.results_framework.md": (
+            "Test match results.\n",
+            "04.05.00.results_framework.md",
+        ),
+        "04.05.01.crispr.md": ("Test match results.\n", "04.05.01.crispr.md"),
+        "04.15.drug_disease_prediction.md": (
+            "Test match results.\n",
+            "04.15.drug_disease_prediction.md",
+        ),
+        "04.20.00.traits_clustering.md": (
+            "Test match results.\n",
+            "04.20.00.traits_clustering.md",
+        ),
         # more prompts that match a part of the filename
-        '05.discussion.md': ('Test match introduction or discussion.\n', 'discussion'),
-        '07.00.methods.md': ('Test match methods.\n', 'methods'),
-
+        "05.discussion.md": ("Test match introduction or discussion.\n", "discussion"),
+        "07.00.methods.md": ("Test match methods.\n", "methods"),
         # these are all explicitly ignored in ai_revision-config.yaml
-        '10.references.md': (IGNORE_FILE, 'references'),
-        '15.acknowledgements.md': (IGNORE_FILE, 'acknowledgements'),
-        '50.00.supplementary_material.md': (IGNORE_FILE, 'supplementary_material')
+        "10.references.md": (IGNORE_FILE, "references"),
+        "15.acknowledgements.md": (IGNORE_FILE, "acknowledgements"),
+        "50.00.supplementary_material.md": (IGNORE_FILE, "supplementary_material"),
     }
 
     for filename, (expected_prompt, expected_match) in phenoplier_files_matches.items():
@@ -76,7 +88,8 @@ def test_resolve_prompt():
         if expected_match is None:
             assert match is None
         else:
-            assert match.string[match.start():match.end()] == expected_match
+            assert match.string[match.start() : match.end()] == expected_match
+
 
 # test that we get the default prompt with a None match object for a
 # file we don't recognize
@@ -90,6 +103,7 @@ def test_resolve_default_prompt_unknown_file():
     assert prompt.strip() == """default prompt text"""
     assert match is None
 
+
 # check that a file we don't recognize gets match==None and the 'default' prompt
 # from the ai_revision-config.yaml file
 @mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PHENOPLIER_PROMPTS_DIR))
@@ -103,6 +117,7 @@ def test_unresolved_gets_default_prompt():
 
     assert prompt.strip() == """default prompt text"""
 
+
 # ==============================================================================
 # === prompts_files tests, using ai_revision-prompts.yaml w/ai_revision-config.yaml to process ignores, defaults
 # ==============================================================================
@@ -111,43 +126,53 @@ def test_unresolved_gets_default_prompt():
 # https://github.com/manubot/manubot-ai-editor/issues/31
 # we test four different scenarios from ./config_loader_fixtures:
 # - Only ai_revision-prompts.yaml is defined (only_revision_prompts)
-ONLY_REV_PROMPTS_DIR = Path(__file__).parent / "config_loader_fixtures" / "only_revision_prompts"
+ONLY_REV_PROMPTS_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "only_revision_prompts"
+)
 # - Both ai_revision-prompts.yaml and ai_revision-config.yaml are defined (both_prompts_config)
-BOTH_PROMPTS_CONFIG_DIR = Path(__file__).parent / "config_loader_fixtures" / "both_prompts_config"
+BOTH_PROMPTS_CONFIG_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "both_prompts_config"
+)
 # - Only a single, generic prompt is defined (single_generic_prompt)
-SINGLE_GENERIC_PROMPT_DIR = Path(__file__).parent / "config_loader_fixtures" / "single_generic_prompt"
+SINGLE_GENERIC_PROMPT_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "single_generic_prompt"
+)
 
 # ---
 # test ManuscriptEditor.prompt_config sub-attributes are set correctly
 # ---
 
+
 def get_editor():
     content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
     editor = ManuscriptEditor(content_dir)
     assert isinstance(editor, ManuscriptEditor)
     return editor
 
+
 def test_no_config_unloaded():
     """
     With no config files defined, the ManuscriptPromptConfig object should
     have its attributes set to None.
     """
     editor = get_editor()
-    
+
     # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
     assert editor.prompt_config.prompts is None
     assert editor.prompt_config.prompts_files is None
     assert editor.prompt_config.config is None
 
+
 @mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, ONLY_REV_PROMPTS_DIR))
 def test_only_rev_prompts_loaded():
     editor = get_editor()
 
     # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
-    assert editor.prompt_config.prompts is  None
+    assert editor.prompt_config.prompts is None
     assert editor.prompt_config.prompts_files is not None
     assert editor.prompt_config.config is None
 
+
 @mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, BOTH_PROMPTS_CONFIG_DIR))
 def test_both_prompts_loaded():
     editor = get_editor()
@@ -157,7 +182,10 @@ def test_both_prompts_loaded():
     assert editor.prompt_config.prompts_files is None
     assert editor.prompt_config.config is not None
 
-@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, SINGLE_GENERIC_PROMPT_DIR))
+
+@mock.patch(
+    "builtins.open", mock_unify_open(MANUSCRIPTS_DIR, SINGLE_GENERIC_PROMPT_DIR)
+)
 def test_single_generic_loaded():
     editor = get_editor()
 
@@ -166,14 +194,16 @@ def test_single_generic_loaded():
     assert editor.prompt_config.prompts_files is not None
     assert editor.prompt_config.config is not None
 
+
 # ---
 # test that ignored files are ignored in applicable scenarios
 # ---
-    
+
 # places in configs where files can be ignored:
 # ai_revision-config.yaml: the `files.ignore` key
 # ai_revision-prompts.yaml: when a prompt in `prompts_files` has a value of null
 
+
 @pytest.mark.parametrize(
     "model",
     [
diff --git a/tests/utils/dir_union.py b/tests/utils/dir_union.py
index bb513a6..8d8f804 100644
--- a/tests/utils/dir_union.py
+++ b/tests/utils/dir_union.py
@@ -1,9 +1,9 @@
 import os
 from pathlib import Path
-from unittest import mock
 
 from contextlib import contextmanager
 
+
 @contextmanager
 def set_directory(new):
     """
@@ -13,7 +13,7 @@ def set_directory(new):
     Note that if we upgrade to Python 3.11, this method can be replaced
     with https://docs.python.org/3/library/contextlib.html#contextlib.chdir
     """
-    
+
     # store the current path so we can return to it
     original = Path().absolute()
 
@@ -23,6 +23,7 @@ def set_directory(new):
     finally:
         os.chdir(original)
 
+
 def mock_unify_open(original, patched):
     """
     Given paths to an 'original' and 'patched' folder,
@@ -38,12 +39,8 @@ def unify_open(*args, **kwargs):
 
             # resolve all paths: the original, patched, and requested file
             target_full_path = Path(args[0]).absolute()
-            rewritten_path = (
-                str(target_full_path)
-                    .replace(
-                        str(original.absolute()),
-                        str(patched.absolute())
-                    )
+            rewritten_path = str(target_full_path).replace(
+                str(original.absolute()), str(patched.absolute())
             )
 
             return builtin_open(rewritten_path, *(args[1:]), **kwargs)
diff --git a/tests/utils/test_dir_union.py b/tests/utils/test_dir_union.py
index 0a21d46..b7a42d4 100644
--- a/tests/utils/test_dir_union.py
+++ b/tests/utils/test_dir_union.py
@@ -9,6 +9,7 @@
 UNIFY_ORIG_DIR = UNIFY_TEST_DIR / "original"
 UNIFY_PATCHED_DIR = UNIFY_TEST_DIR / "patched"
 
+
 @mock.patch("builtins.open", mock_unify_open(UNIFY_ORIG_DIR, UNIFY_PATCHED_DIR))
 def test_unify_folder_mock():
     # test that we can still open files in the original folder
@@ -18,6 +19,7 @@ def test_unify_folder_mock():
     with open(UNIFY_ORIG_DIR / "another.txt") as fp:
         assert fp.read().strip() == "patched in via unify mock"
 
+
 @mock.patch("builtins.open", mock_unify_open(UNIFY_ORIG_DIR, UNIFY_PATCHED_DIR))
 def test_unify_folder_mock_relative_paths():
     with set_directory(UNIFY_ORIG_DIR):

From 21f3c3ad1306c95b2fe7587763cbc9d8f16b0647 Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@gmail.com>
Date: Fri, 2 Feb 2024 15:06:47 -0700
Subject: [PATCH 09/44] remove old test file

we are doing prompt testing separately now
---
 tests/test_model_revision.py | 1128 ----------------------------------
 1 file changed, 1128 deletions(-)
 delete mode 100644 tests/test_model_revision.py

diff --git a/tests/test_model_revision.py b/tests/test_model_revision.py
deleted file mode 100644
index e9ff26f..0000000
--- a/tests/test_model_revision.py
+++ /dev/null
@@ -1,1128 +0,0 @@
-"""
-These tests need to call the OpenAI API, so they are in a separate file and can incur costs.
-"""
-
-from unittest import mock
-
-import pytest
-
-from manubot_ai_editor import env_vars
-from manubot_ai_editor.editor import ManuscriptEditor
-from manubot_ai_editor.models import GPT3CompletionModel
-from manubot_ai_editor.utils import starts_with_similar
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_abstract_ccc(model):
-    # from CCC manuscript
-    paragraph = r"""
-Correlation coefficients are widely used to identify patterns in data that may be of particular interest.
-In transcriptomics, genes with correlated expression often share functions or are part of disease-relevant biological processes.
-Here we introduce the Clustermatch Correlation Coefficient (CCC), an efficient, easy-to-use and not-only-linear coefficient based on machine learning models.
-CCC reveals biologically meaningful linear and nonlinear patterns missed by standard, linear-only correlation coefficients.
-CCC captures general patterns in data by comparing clustering solutions while being much faster than state-of-the-art coefficients such as the Maximal Information Coefficient.
-When applied to human gene expression data, CCC identifies robust linear relationships while detecting nonlinear patterns associated, for example, with sex differences that are not captured by linear-only coefficients.
-Gene pairs highly ranked by CCC were enriched for interactions in integrated networks built from protein-protein interaction, transcription factor regulation, and chemical and genetic perturbations, suggesting that CCC could detect functional relationships that linear-only methods missed.
-CCC is a highly-efficient, next-generation not-only-linear correlation coefficient that can readily be applied to genome-scale data and other domains across different data types.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 8
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "abstract"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # # original and revised paragraph should be quite different
-    # _ratio = difflib.SequenceMatcher(lambda x: x in (" ", "\n",), paragraph_text, paragraph_revised).ratio()
-    # assert _ratio < 0.10 if model.endpoint != "edits" else 1.0, _ratio
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[" not in paragraph_revised
-    assert "@" not in paragraph_revised
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@mock.patch.dict(
-    "os.environ",
-    {env_vars.CUSTOM_PROMPT: "proofread the following paragraph"},
-)
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_abstract_ccc_with_custom_prompt(model):
-    # from CCC manuscript
-    paragraph = r"""
-Correlation coefficients are widely used to identify patterns in data that may be of particular interest.
-In transcriptomics, genes with correlated expression often share functions or are part of disease-relevant biological processes.
-Here we introduce the Clustermatch Correlation Coefficient (CCC), an efficient, easy-to-use and not-only-linear coefficient based on machine learning models.
-CCC reveals biologically meaningful linear and nonlinear patterns missed by standard, linear-only correlation coefficients.
-CCC captures general patterns in data by comparing clustering solutions while being much faster than state-of-the-art coefficients such as the Maximal Information Coefficient.
-When applied to human gene expression data, CCC identifies robust linear relationships while detecting nonlinear patterns associated, for example, with sex differences that are not captured by linear-only coefficients.
-Gene pairs highly ranked by CCC were enriched for interactions in integrated networks built from protein-protein interaction, transcription factor regulation, and chemical and genetic perturbations, suggesting that CCC could detect functional relationships that linear-only methods missed.
-CCC is a highly-efficient, next-generation not-only-linear correlation coefficient that can readily be applied to genome-scale data and other domains across different data types.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 8
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "abstract"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # # since the custom prompt also "proofreads", the similarity between input and revised text should be very high
-    # _ratio = difflib.SequenceMatcher(lambda x: x in (" ", "\n",), paragraph_text, paragraph_revised).ratio()
-    # assert _ratio > 0.50, _ratio
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[" not in paragraph_revised
-    assert "@" not in paragraph_revised
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_abstract_phenoplier(model):
-    # from PhenoPLIER manuscript
-    paragraph = r"""
-Genes act in concert with each other in specific contexts to perform their functions.
-Determining how these genes influence complex traits requires a mechanistic understanding of expression regulation across different conditions.
-It has been shown that this insight is critical for developing new therapies.
-In this regard, the role of individual genes in disease-relevant mechanisms can be hypothesized with transcriptome-wide association studies (TWAS), which have represented a significant step forward in testing the mediating role of gene expression in GWAS associations.
-However, modern models of the architecture of complex traits predict that gene-gene interactions play a crucial role in disease origin and progression.
-Here we introduce PhenoPLIER, a computational approach that maps gene-trait associations and pharmacological perturbation data into a common latent representation for a joint analysis.
-This representation is based on modules of genes with similar expression patterns across the same conditions.
-We observed that diseases were significantly associated with gene modules expressed in relevant cell types, and our approach was accurate in predicting known drug-disease pairs and inferring mechanisms of action.
-Furthermore, using a CRISPR screen to analyze lipid regulation, we found that functionally important players lacked TWAS associations but were prioritized in trait-associated modules by PhenoPLIER.
-By incorporating groups of co-expressed genes, PhenoPLIER can contextualize genetic associations and reveal potential targets missed by single-gene strategies.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 10
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "abstract"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[" not in paragraph_revised
-    assert "@" not in paragraph_revised
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_abstract_ai_revision(model):
-    # from LLM for articles revision manuscript
-    paragraph = r"""
-Academics often communicate through scholarly manuscripts.
-These manuscripts describe new advances, summarize existing literature, or argue for changes in the status quo.
-Writing and revising manuscripts can be a time-consuming process.
-Large language models are bringing new capabilities to many areas of knowledge work.
-We integrated the use of large language models into the Manubot publishing ecosystem.
-Users of Manubot can run a workflow, which will trigger a series of queries to OpenAI's language models, produce revisions, and create a timestamped set of suggested revisions.
-Given the amount of time that researchers put into crafting prose, we expect this advance to radically transform the type of knowledge work that academics perform.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 7
-
-    model.title = "A publishing infrastructure for AI-assisted academic authoring"
-    model.keywords = [
-        "manubot",
-        "artificial intelligence",
-        "scholarly publishing",
-        "software",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "abstract"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[" not in paragraph_revised
-    assert "@" not in paragraph_revised
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_introduction_paragraph_with_single_and_multiple_citations_together(
-    model,
-):
-    # from CCC manuscript
-    paragraph = r"""
-In transcriptomics, many analyses start with estimating the correlation between genes.
-More sophisticated approaches built on correlation analysis can suggest gene function [@pmid:21241896], aid in discovering common and cell lineage-specific regulatory networks [@pmid:25915600], and capture important interactions in a living organism that can uncover molecular mechanisms in other species [@pmid:21606319; @pmid:16968540].
-The analysis of large RNA-seq datasets [@pmid:32913098; @pmid:34844637] can also reveal complex transcriptional mechanisms underlying human diseases [@pmid:27479844; @pmid:31121115; @pmid:30668570; @pmid:32424349; @pmid:34475573].
-Since the introduction of the omnigenic model of complex traits [@pmid:28622505; @pmid:31051098], gene-gene relationships are playing an increasingly important role in genetic studies of human diseases [@pmid:34845454; @doi:10.1101/2021.07.05.450786; @doi:10.1101/2021.10.21.21265342; @doi:10.1038/s41588-021-00913-z], even in specific fields such as polygenic risk scores [@doi:10.1016/j.ajhg.2021.07.003].
-In this context, recent approaches combine disease-associated genes from genome-wide association studies (GWAS) with gene co-expression networks to prioritize "core" genes directly affecting diseases [@doi:10.1186/s13040-020-00216-9; @doi:10.1101/2021.07.05.450786; @doi:10.1101/2021.10.21.21265342].
-These core genes are not captured by standard statistical methods but are believed to be part of highly-interconnected, disease-relevant regulatory networks.
-Therefore, advanced correlation coefficients could immediately find wide applications across many areas of biology, including the prioritization of candidate drug targets in the precision medicine field.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 7
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "introduction"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[@" in paragraph_revised
-    assert paragraph_revised.count("@") >= int(paragraph_text.count("@") * 0.50)
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_introduction_paragraph_with_citations_and_paragraph_is_the_first(model):
-    # from PhenoPLIER manuscript
-    paragraph = r"""
-Genes work together in context-specific networks to carry out different functions [@pmid:19104045; @doi:10.1038/ng.3259].
-Variations in these genes can change their functional role and, at a higher level, affect disease-relevant biological processes [@doi:10.1038/s41467-018-06022-6].
-In this context, determining how genes influence complex traits requires mechanistically understanding expression regulation across different cell types [@doi:10.1126/science.aaz1776; @doi:10.1038/s41586-020-2559-3; @doi:10.1038/s41576-019-0200-9], which in turn should lead to improved treatments [@doi:10.1038/ng.3314; @doi:10.1371/journal.pgen.1008489].
-Previous studies have described different regulatory DNA elements [@doi:10.1038/nature11247; @doi:10.1038/nature14248; @doi:10.1038/nature12787; @doi:10.1038/s41586-020-03145-z; @doi:10.1038/s41586-020-2559-3] including genetic effects on gene expression across different tissues [@doi:10.1126/science.aaz1776].
-Integrating functional genomics data and GWAS data [@doi:10.1038/s41588-018-0081-4; @doi:10.1016/j.ajhg.2018.04.002; @doi:10.1038/s41588-018-0081-4; @doi:10.1038/ncomms6890] has improved the identification of these transcriptional mechanisms that, when dysregulated, commonly result in tissue- and cell lineage-specific pathology [@pmid:20624743; @pmid:14707169; @doi:10.1073/pnas.0810772105].
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 5
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "introduction"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[@" in paragraph_revised
-    assert paragraph_revised.count("@") >= int(paragraph_text.count("@") * 0.50)
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_introduction_paragraph_with_citations_and_paragraph_is_the_last(model):
-    # from LLM for articles revision manuscript
-    paragraph = r"""
-We developed a software publishing platform that imagines a future where authors co-write their manuscripts with the support of large language models.
-We used, as a base, the Manubot platform for scholarly publishing [@doi:10.1371/journal.pcbi.1007128].
-Manubot was designed as an end-to-end publishing platform for scholarly writing for both individual and large-collaborative projects.
-It has been used for collaborations of approximately 50 authors writing hundreds of pages of text reviewing progress during the COVID19 pandemic [@pmid:34545336].
-We developed a new workflow that parses the manuscript, uses a large language model with section-specific custom prompts to revise the manuscript, and then creates a set of suggested changes to reach the revised state.
-Changes are presented to the user through the GitHub interface for author review and integration into the published document.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 6
-
-    model.title = "A publishing infrastructure for AI-assisted academic authoring"
-    model.keywords = [
-        "manubot",
-        "artificial intelligence",
-        "scholarly publishing",
-        "software",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "introduction"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 25
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # most citations were kept in the revised text
-    assert "[@" in paragraph_revised
-    assert paragraph_revised.count("@") >= int(paragraph_text.count("@") * 0.50)
-
-    # no references to figures or tables
-    assert "Figure" not in paragraph_revised
-    assert "Table" not in paragraph_revised
-
-    # no math
-    assert "$" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_results_paragraph_with_short_inline_formulas_and_refs_to_figures_and_citations(
-    model,
-):
-    # from CCC manuscript
-    paragraph = r"""
-We examined how the Pearson ($p$), Spearman ($s$) and CCC ($c$) correlation coefficients behaved on different simulated data patterns.
-In the first row of Figure @fig:datasets_rel, we examine the classic Anscombe's quartet [@doi:10.1080/00031305.1973.10478966], which comprises four synthetic datasets with different patterns but the same data statistics (mean, standard deviation and Pearson's correlation).
-This kind of simulated data, recently revisited with the "Datasaurus" [@url:http://www.thefunctionalart.com/2016/08/download-datasaurus-never-trust-summary.html; @doi:10.1145/3025453.3025912; @doi:10.1111/dsji.12233], is used as a reminder of the importance of going beyond simple statistics, where either undesirable patterns (such as outliers) or desirable ones (such as biologically meaningful nonlinear relationships) can be masked by summary statistics alone.
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 3
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "results"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # some citations were kept in the revised text
-    assert "[@" in paragraph_revised
-
-    # references to figures were kept
-    assert "Figure @fig:datasets_rel" in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_results_paragraph_with_lists_and_refs_to_sections_and_subfigs(model):
-    # from PhenoPLIER manuscript
-    paragraph = r"""
-PhenoPLIER is a flexible computational framework that combines gene-trait and gene-drug associations with gene modules expressed in specific contexts (Figure {@fig:entire_process}a).
-The approach uses a latent representation (with latent variables or LVs representing gene modules) derived from a large gene expression compendium (Figure {@fig:entire_process}b, top) to integrate TWAS with drug-induced transcriptional responses (Figure {@fig:entire_process}b, bottom) for a joint analysis.
-The approach consists in three main components (Figure {@fig:entire_process}b, middle, see [Methods](#sec:methods)):
-1) an LV-based regression model to compute an association between an LV and a trait,
-2) a clustering framework to learn groups of traits with shared transcriptomic properties,
-and 3) an LV-based drug repurposing approach that links diseases to potential treatments.
-We performed extensive simulations for our regression model ([Supplementary Note 1](#sm:reg:null_sim)) and clustering framework ([Supplementary Note 2](#sm:clustering:null_sim)) to ensure proper calibration and expected results under a model of no association.
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 7
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "results"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # list was kept
-    assert "1)" in paragraph_revised
-    assert "2)" in paragraph_revised
-    assert "3)" in paragraph_revised
-
-    # references to sub figures were kept
-    assert "Figure {@fig:entire_process}a" in paragraph_revised
-    assert "Figure {@fig:entire_process}b" in paragraph_revised
-
-    # ferences to sections were kept
-    assert "[Supplementary Note 1](#sm:reg:null_sim)" in paragraph_revised
-    assert "[Supplementary Note 2](#sm:clustering:null_sim)" in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_results_paragraph_is_too_long(model):
-    # from CCC manuscript
-    paragraph = r"""
-We sought to systematically analyze discrepant scores to assess whether associations were replicated in other datasets besides GTEx.
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 1
-
-    paragraph = paragraph * 200
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "results"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-
-    # if there is an error, it should return the original paragraph with a header specifying the error
-    error_message = r"""
-<!--
-ERROR: the paragraph below could not be revised with the AI model due to the following error:
-
-This model's maximum context length is 4097 tokens, however you requested 17570 tokens (4272 in your prompt; 13298 for the completion). Please reduce your prompt; or completion length.
--->
-    """.strip()
-    assert starts_with_similar(
-        paragraph_revised, error_message, 0.55 if not model.edit_endpoint else 0.30
-    )
-
-    # remove the multiline html comment at the top of the revised paragraph
-    paragraph_revised_without_error = paragraph_revised.split("-->\n")[1].strip()
-    assert "\n".join(paragraph) == paragraph_revised_without_error
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_discussion_paragraph_with_markdown_formatting_and_citations(model):
-    # from CCC manuscript
-    paragraph = r"""
-It is well-known that biomedical research is biased towards a small fraction of human genes [@pmid:17620606; @pmid:17472739].
-Some genes highlighted in CCC-ranked pairs (Figure @fig:upsetplot_coefs b), such as *SDS* (12q24) and *ZDHHC12* (9q34), were previously found to be the focus of fewer than expected publications [@pmid:30226837].
-It is possible that the widespread use of linear coefficients may bias researchers away from genes with complex coexpression patterns.
-A beyond-linear gene co-expression analysis on large compendia might shed light on the function of understudied genes.
-For example, gene *KLHL21* (1p36) and *AC068580.6* (*ENSG00000235027*, in 11p15) have a high CCC value and are missed by the other coefficients.
-*KLHL21* was suggested as a potential therapeutic target for hepatocellular carcinoma [@pmid:27769251] and other cancers [@pmid:29574153; @pmid:35084622].
-Its nonlinear correlation with *AC068580.6* might unveil other important players in cancer initiation or progression, potentially in subsets of samples with specific characteristics (as suggested in Figure @fig:upsetplot_coefs b).
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 7
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "discussion"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # some citations were kept in the revised text
-    assert "[@" in paragraph_revised
-
-    # Markdown formatting was kept in the revised text
-    assert "*" in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_discussion_paragraph_with_minor_math_and_refs_to_sections_and_websites(
-    model,
-):
-    # from PhenoPLIER manuscript
-    paragraph = r"""
-Finally, we developed an LV-based regression framework to detect whether gene modules are associated with a trait using TWAS $p$-values.
-We used PhenomeXcan as a discovery cohort across four thousand traits, and many LV-trait associations replicated in eMERGE.
-In PhenomeXcan, we found 3,450 significant LV-trait associations (FDR < 0.05) with 686 LVs (out of 987) associated with at least one trait and 1,176 traits associated with at least one LV.
-In eMERGE, we found 196 significant LV-trait associations, with 116 LVs associated with at least one trait/phecode and 81 traits with at least one LV.
-We only focused on a few disease types from our trait clusters, but the complete set of associations on other disease domains is available in our [Github repository](https://github.com/greenelab/phenoplier) for future research.
-As noted in [Methods](#sec:methods:reg), one limitation of the regression approach is that the gene-gene correlations are only approximately accurate, which could lead to false positives if the correlation among the top genes in a module is not precisely captured.
-The regression model, however, is approximately well-calibrated, and we did not observe inflation when running the method in real data.
-        """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 7
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "discussion"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # equations or minor math were kept in the revised text
-    # assert "$" in paragraph_revised
-    assert "FDR < 0.05" in paragraph_revised
-
-    # refs to external websites
-    assert (
-        "[Github repository](https://github.com/greenelab/phenoplier)"
-        in paragraph_revised
-    )
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_conclusions_paragraph_with_simple_text(model):
-    # conclusions is the same as discussion in CCC/PhenoPLIER
-
-    # from LLM for articles revision manuscript
-    paragraph = r"""
-We implemented AI-based models into publishing infrastructure.
-While most manuscripts have been written by humans, the process is time consuming and academic writing can be difficult to parse.
-We sought to develop a technology that academics could use to make their writing more understandable without changing the fundamental meaning.
-This work lays the foundation for a future where academic manuscripts are constructed by a process that incorporates both human and machine authors.
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 4
-
-    model.title = "A publishing infrastructure for AI-assisted academic authoring"
-    model.keywords = [
-        "manubot",
-        "artificial intelligence",
-        "scholarly publishing",
-        "software",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "conclusions"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # revised text does not have math or references
-    assert "$" not in paragraph_revised
-    assert "[" not in paragraph_revised
-    assert "@" not in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_methods_paragraph_with_inline_equations_and_figure_refs(model):
-    # from CCC manuscript
-    paragraph = r"""
-The Clustermatch Correlation Coefficient (CCC) computes a similarity value $c \in \left[0,1\right]$ between any pair of numerical or categorical features/variables $\mathbf{x}$ and $\mathbf{y}$ measured on $n$ objects.
-CCC assumes that if two features $\mathbf{x}$ and $\mathbf{y}$ are similar, then the partitioning by clustering of the $n$ objects using each feature separately should match.
-For example, given $\mathbf{x}=(11, 27, 32, 40)$ and $\mathbf{y}=10x=(110, 270, 320, 400)$, where $n=4$, partitioning each variable into two clusters ($k=2$) using their medians (29.5 for $\mathbf{x}$ and 295 for $\mathbf{y}$) would result in partition $\Omega^{\mathbf{x}}_{k=2}=(1, 1, 2, 2)$ for $\mathbf{x}$, and partition $\Omega^{\mathbf{y}}_{k=2}=(1, 1, 2, 2)$ for $\mathbf{y}$.
-Then, the agreement between $\Omega^{\mathbf{x}}_{k=2}$ and $\Omega^{\mathbf{y}}_{k=2}$ can be computed using any measure of similarity between partitions, like the adjusted Rand index (ARI) [@doi:10.1007/BF01908075].
-In that case, it will return the maximum value (1.0 in the case of ARI).
-Note that the same value of $k$ might not be the right one to find a relationship between any two features.
-For instance, in the quadratic example in Figure @fig:datasets_rel, CCC returns a value of 0.36 (grouping objects in four clusters using one feature and two using the other).
-If we used only two clusters instead, CCC would return a similarity value of 0.02.
-Therefore, the CCC algorithm (shown below) searches for this optimal number of clusters given a maximum $k$, which is its single parameter $k_{\mathrm{max}}$.
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 9
-
-    model.title = (
-        "An efficient not-only-linear correlation coefficient based on machine learning"
-    )
-    model.keywords = [
-        "correlation coefficient",
-        "nonlinear relationships",
-        "gene expression",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "methods"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # some formulas are referenced in the revised text
-    assert "$" in paragraph_revised
-
-    # some figures are referenced in the revised text
-    assert "Figure @fig:datasets_rel" in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_methods_paragraph_with_figure_table_and_equation_refs(model):
-    # from PhenoPLIER manuscript:
-    paragraph = r"""
-Note that, since we used the MultiXcan regression model (Equation (@eq:multixcan)), $\mathbf{R}$ is only an approximation of gene correlations in S-MultiXcan.
-As explained before, S-MultiXcan approximates the joint regression parameters in MultiXcan using the marginal regression estimates from S-PrediXcan in (@eq:spredixcan) with some simplifying assumptions and different genotype covariance matrices.
-This complicates the derivation of an S-MultiXcan-specific solution to compute $\mathbf{R}$.
-To account for this, we used a submatrix $\mathbf{R}_{\ell}$ corresponding to genes that are part of LV $\ell$ only (top 1% of genes) instead of the entire matrix $\mathbf{R}$.
-This simplification is conservative since correlations are accounted for top genes only.
-Our simulations ([Supplementary Note 1](#sm:reg:null_sim)) show that the model is approximately well-calibrated and can correct for LVs with adjacent and highly correlated genes at the top (e.g., Figure @fig:reg:nulls:qqplot:lv234).
-The model can also detect LVs associated with relevant traits (Figure @fig:lv246 and Table @tbl:sup:phenomexcan_assocs:lv246) that are replicated in a different cohort (Table @tbl:sup:emerge_assocs:lv246).
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 7
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "methods"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # some equations are referenced in the revised text
-    assert ("Equation (@eq:multixcan)" in paragraph_revised) or (
-        "Equation (@eq:spredixcan)" in paragraph_revised
-    )
-
-    # some figures/tables are referenced in the revised text
-    assert "Figure @fig:lv246" in paragraph_revised
-    assert "Table @tbl:sup:phenomexcan_assocs:lv246" in paragraph_revised
-    assert "Table @tbl:sup:emerge_assocs:lv246" in paragraph_revised
-
-    # reference to important sections
-    assert "[Supplementary Note 1](#sm:reg:null_sim)" in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_methods_paragraph_with_inline_math_and_equations(model):
-    # from PhenoPLIER manuscript:
-    paragraph = r"""
-S-PrediXcan [@doi:10.1038/s41467-018-03621-1] is the summary version of PrediXcan [@doi:10.1038/ng.3367].
-PrediXcan models the trait as a linear function of the gene's expression on a single tissue using the univariate model
-
-$$
-\mathbf{y} = \mathbf{t}_l \gamma_l + \bm{\epsilon}_l,
-$$ {#eq:predixcan}
-
-where $\hat{\gamma}_l$ is the estimated effect size or regression coefficient, and $\bm{\epsilon}_l$ are the error terms with variance $\sigma_{\epsilon}^{2}$.
-The significance of the association is assessed by computing the $z$-score $\hat{z}_{l}=\hat{\gamma}_l / \mathrm{se}(\hat{\gamma}_l)$ for a gene's tissue model $l$.
-PrediXcan needs individual-level data to fit this model, whereas S-PrediXcan approximates PrediXcan $z$-scores using only GWAS summary statistics with the expression
-
-$$
-\hat{z}_{l} \approx \sum_{a \in model_{l}} w_a^l \frac{\hat{\sigma}_a}{\hat{\sigma}_l} \frac{\hat{\beta}_a}{\mathrm{se}(\hat{\beta}_a)},
-$$ {#eq:spredixcan}
-
-where $\hat{\sigma}_a$ is the variance of SNP $a$, $\hat{\sigma}_l$ is the variance of the predicted expression of a gene in tissue $l$, and $\hat{\beta}_a$ is the estimated effect size of SNP $a$ from the GWAS.
-In these TWAS methods, the genotype variances and covariances are always estimated using the Genotype-Tissue Expression project (GTEx v8) [@doi:10.1126/science.aaz1776] as the reference panel.
-Since S-PrediXcan provides tissue-specific direction of effects (for instance, whether a higher or lower predicted expression of a gene confers more or less disease risk), we used the $z$-scores in our drug repurposing approach (described below).
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 18
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "methods"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    # some equations are referenced in the revised text
-    assert "$$ {#eq:predixcan}" in paragraph_revised
-    assert "$$ {#eq:spredixcan}" in paragraph_revised
-    assert "$\hat{\sigma}_a$" in paragraph_revised
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_methods_paragraph_without_fig_table_reference(model):
-    # from LLM for articles revision manuscript
-    paragraph = r"""
-We used the OpenAI API for access to large language models, with a focus on the completion endpoints.
-This API incurs a cost with each run that depends on manuscript length.
-Because of this cost, we implemented our workflow in GitHub actions, making it triggerable by the user.
-The user can select the model that they wish to use, allowing costs to be tuned.
-With the most complex model, `text-davinci-003`, the cost per run is under $0.50 for many manuscripts.
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 5
-
-    model.title = "A publishing infrastructure for AI-assisted academic authoring"
-    model.keywords = [
-        "manubot",
-        "artificial intelligence",
-        "scholarly publishing",
-        "software",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "methods"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert paragraph_revised[-1] == "."
-
-    assert "`text-davinci-003`" in paragraph_revised
-
-    # no figures/tables are referenced in the revised text
-    assert "figure" not in paragraph_revised.lower()
-    assert "table" not in paragraph_revised.lower()
-    assert "@" not in paragraph_revised.lower()
-
-
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(None, None),
-        GPT3CompletionModel(None, None, model_engine="text-davinci-edit-001"),
-        GPT3CompletionModel(None, None, model_engine="gpt-3.5-turbo"),
-    ],
-)
-@pytest.mark.cost
-def test_revise_methods_paragraph_with_many_tokens(model):
-    # from PhenoPLIER manuscript:
-    paragraph = r"""
-Since the error terms $\bm{\epsilon}$ could be correlated, we cannot assume they have independent normal distributions as in a standard linear regression model.
-In the PrediXcan family of methods, the predicted expression of a pair of genes could be correlated if they share eQTLs or if these are in LD [@doi:10.1038/s41588-019-0385-z].
-Therefore, we used a generalized least squares approach to account for these correlations.
-The gene-gene correlation matrix $\mathbf{R}$ was approximated by computing the correlations between the model sum of squares (SSM) for each pair of genes under the null hypothesis of no association.
-These correlations are derived from the individual-level MultiXcan model (Equation (@eq:multixcan)), where the predicted expression matrix $\mathbf{T}_{i} \in \mathbb{R}^{n \times p_i}$ of a gene $i$ across $p_i$ tissues is projected into its top $k_i$ PCs, resulting in matrix $\mathbf{P}_{i} \in \mathbb{R}^{n \times k_i}$.
-From the MAGMA framework, we know that the SSM for each gene is proportial to $\mathbf{y}^{\top} \mathbf{P}_{i} \mathbf{P}_{i}^{\top} \mathbf{y}$.
-Under the null hypothesis of no association, the covariances between the SSM of genes $i$ and $j$ is therefore given by $2 \times \mathrm{Trace}(\mathbf{P}_{i}^{\top} \mathbf{P}_{j} \mathbf{P}_{j}^{\top} \mathbf{P}_{i})$.
-The standard deviations of each SSM are given by $\sqrt{2 \times k_{i}} \times (n - 1)$.
-Therefore, the correlation between the SSMs for genes $i$ and $j$ can be written as follows:
-
-$$
-\begin{split}
-\mathbf{R}_{ij} & = \frac{2 \times \mathrm{Tr}(\mathbf{P}_{i}^{\top} \mathbf{P}_{j} \mathbf{P}_{j}^{\top} \mathbf{P}_{i})}{\sqrt{2 \times k_{i}} \times \sqrt{2 \times k_{j}} \times (n - 1)^2} \\
-& = \frac{2 \times \mathrm{Tr}(Cor(\mathbf{P}_{i}, \mathbf{P}_{j}) \times Cor(\mathbf{P}_{j}, \mathbf{P}_{i}))}{\sqrt{2 \times k_{i}} \times \sqrt{2 \times k_{j}}},
-\end{split}
-$$ {#eq:reg:r}
-
-where columns $\mathbf{P}$ are standardized,
-$\mathrm{Tr}$ is the trace of a matrix,
-and the cross-correlation matrix between PCs $Cor(\mathbf{P}_{i}, \mathbf{P}_{j}) \in \mathbb{R}^{k_i \times k_j}$ is given by
-
-$$
-\begin{split}
-Cor(\mathbf{P}_{i}, \mathbf{P}_{j}) & = Cor(\mathbf{T}_{i} \mathbf{V}_{i}^{\top} \mathrm{diag}(\lambda_i)^{-1/2}, \mathbf{T}_{j} \mathbf{V}_{j}^{\top} \mathrm{diag}(\lambda_j)^{-1/2}) \\
-& = \mathrm{diag}(\lambda_i)^{-1/2} \mathbf{V}_{i} (\frac{\mathbf{T}_{i}^{\top} \mathbf{T}_{j}}{n-1}) \mathbf{V}_{j}^{\top} \mathrm{diag}(\lambda_j)^{-1/2},
-\end{split}
-$$ {#eq:reg:cor_pp}
-
-where $\frac{\mathbf{T}_{i}^{\top} \mathbf{T}_{j}}{n-1} \in \mathbb{R}^{p_i \times p_j}$ is the cross-correlation matrix between the predicted expression levels of genes $i$ and $j$,
-and columns of $\mathbf{V}_{i}$ and scalars $\lambda_i$ are the eigenvectors and eigenvalues of $\mathbf{T}_{i}$, respectively.
-S-MultiXcan keeps only the top eigenvectors using a condition number threshold of $\frac{\max(\lambda_i)}{\lambda_i} < 30$.
-To estimate the correlation of predicted expression levels for genes $i$ in tissue $k$ and gene $j$ in tissue $l$, $(\mathbf{t}_k^i, \mathbf{t}_l^j)$ ($\mathbf{t}_k^i$ is the $k$th column of $\mathbf{T}_{i}$), we used [@doi:10.1371/journal.pgen.1007889]
-
-$$
-\begin{split}
-\frac{(\mathbf{T}_{i}^{\top} \mathbf{T}_{j})_{kl}}{n-1} & = Cor(\mathbf{t}_k^i, \mathbf{t}_l^j) \\
- & = \frac{ Cov(\mathbf{t}_k, \mathbf{t}_l) } { \sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} } \\
- & = \frac{ Cov(\sum_{a \in \mathrm{model}_k} w_a^k X_a, \sum_{b \in \mathrm{model}_l} w_b^l X_b) }  {\sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} } \\
- & = \frac{ \sum_{a \in \mathrm{model}_k \\ b \in \mathrm{model}_l} w_a^k w_b^l Cov(X_a, X_b)} {\sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} } \\
- & = \frac{ \sum_{a \in \mathrm{model}_k \\ b \in \mathrm{model}_l} w_a^k w_b^l \Gamma_{ab}} {\sqrt{\widehat{\mathrm{var}}(\mathbf{t}_k) \widehat{\mathrm{var}}(\mathbf{t}_l)} },
-\end{split}
-$$ {#eq:reg:corr_genes}
-
-where $X_a$ is the genotype of SNP $a$,
-$w_a^k$ is the weight of SNP $a$ for gene expression prediction in the tissue model $k$,
-and $\Gamma = \widehat{\mathrm{var}}(\mathbf{X}) = (\mathbf{X} - \mathbf{\bar{X}})^{\top} (\mathbf{X} - \mathbf{\bar{X}}) / (n-1)$ is the genotype covariance matrix using GTEx v8 as the reference panel, which is the same used in all TWAS methods described here.
-The variance of the predicted expression values of gene $i$ in tissue $k$ is estimated as [@doi:10.1038/s41467-018-03621-1]:
-
-$$
-\begin{split}
-\widehat{\mathrm{var}}(\mathbf{t}_k^i) & = (\mathbf{W}^k)^\top \Gamma^k \mathbf{W}^k \\
- & = \sum_{a \in \mathrm{model}_k \\ b \in \mathrm{model}_k} w_a^k w_b^k \Gamma_{ab}^k.
-\end{split}
-$$ {#eq:reg:var_gene}
-    """.strip().split(
-        "\n"
-    )
-    paragraph = [sentence.strip() for sentence in paragraph]
-    assert len(paragraph) == 54
-
-    model.title = "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
-    model.keywords = [
-        "genetic studies",
-        "functional genomics",
-        "gene co-expression",
-        "therapeutic targets",
-        "drug repurposing",
-        "clustering of complex traits",
-    ]
-
-    paragraph_text, paragraph_revised = ManuscriptEditor.revise_and_write_paragraph(
-        paragraph, model, "methods"
-    )
-    assert paragraph_text is not None
-    assert paragraph_revised is not None
-    assert isinstance(paragraph_revised, str)
-    assert paragraph_revised != paragraph_text
-    assert len(paragraph_revised) > 100
-    assert "<!--\nERROR:" not in paragraph_revised
-
-    # revised paragraph was finished (no incomplete sentences, which could happen
-    # if the max_tokens parameter is too low)
-    assert (paragraph_revised[-1] == ".") or (paragraph_revised[-1] == "}")
-
-    # some equations are referenced in the revised text
-    assert "$$ {#eq:reg:r}" in paragraph_revised
-    assert "$Cor(\mathbf{P}_{i}, \mathbf{P}_{j})" in paragraph_revised

From 335aace7cef9a6df5952c1ef0da9e91ffafe04f5 Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@gmail.com>
Date: Wed, 14 Feb 2024 13:13:35 -0700
Subject: [PATCH 10/44] setup.py: update version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index eff0a4e..3425f4a 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 
 setuptools.setup(
     name="manubot-ai-editor",
-    version="0.4.10",
+    version="0.5.0",
     author="Milton Pividori",
     author_email="miltondp@gmail.com",
     description="A Manubot plugin to revise a manuscript using GPT-3",

From 9b93c0811e3fe3ff939a2abebf5b95c8b11fab7f Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@gmail.com>
Date: Wed, 14 Feb 2024 13:36:13 -0700
Subject: [PATCH 11/44] add DebuggingManuscriptRevisionModel

---
 libs/manubot_ai_editor/models.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 9bb1bf2..077c28d 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -80,6 +80,24 @@ def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         return f"{self.revised_header}{revised_paragraph}"
 
 
+class DebuggingManuscriptRevisionModel(DummyManuscriptRevisionModel):
+    """
+    This model returns the same paragraph and important information submitted to
+    the final revision function (i.e., that hits the remote API), such as the section
+    name and the resolved prompt.
+    """
+
+    def __init__(self):
+        super().__init__(add_paragraph_marks=True)
+
+    def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
+        revised_paragraph = super().revise_paragraph(paragraph_text, section_name)
+        # in addition to the paragraph start and end from the DummyManuscriptRevisionModel,
+        # add also some metadata including section name and resolved prompt
+        header = f"%%%\nMetadata:\n - Section: '{section_name}'\n - Resolved prompt: '{resolved_prompt}'"
+        return f"{header}\n{revised_paragraph.strip()}"
+
+
 class RandomManuscriptRevisionModel(ManuscriptRevisionModel):
     """
     This model takes a paragraph and randomizes the words. The paragraph has the

From cc3adb0ad85b8f7cfec7783f9549af2057f301fe Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@gmail.com>
Date: Wed, 14 Feb 2024 14:11:12 -0700
Subject: [PATCH 12/44] update DebuggingManuscriptRevisionModel

---
 libs/manubot_ai_editor/models.py | 68 ++++++++++++++++----------------
 1 file changed, 35 insertions(+), 33 deletions(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 077c28d..3787360 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -80,24 +80,6 @@ def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         return f"{self.revised_header}{revised_paragraph}"
 
 
-class DebuggingManuscriptRevisionModel(DummyManuscriptRevisionModel):
-    """
-    This model returns the same paragraph and important information submitted to
-    the final revision function (i.e., that hits the remote API), such as the section
-    name and the resolved prompt.
-    """
-
-    def __init__(self):
-        super().__init__(add_paragraph_marks=True)
-
-    def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
-        revised_paragraph = super().revise_paragraph(paragraph_text, section_name)
-        # in addition to the paragraph start and end from the DummyManuscriptRevisionModel,
-        # add also some metadata including section name and resolved prompt
-        header = f"%%%\nMetadata:\n - Section: '{section_name}'\n - Resolved prompt: '{resolved_prompt}'"
-        return f"{header}\n{revised_paragraph.strip()}"
-
-
 class RandomManuscriptRevisionModel(ManuscriptRevisionModel):
     """
     This model takes a paragraph and randomizes the words. The paragraph has the
@@ -236,7 +218,7 @@ def __init__(
                 pass
 
         self.title = title
-        self.keywords = keywords
+        self.keywords = keywords if keywords is not None else []
 
         # adjust options if edits or chat endpoint was selected
         self.endpoint = "chat"
@@ -477,21 +459,9 @@ def get_max_tokens_from_error_message(error_message: str) -> dict[str, int] | No
             "tokens_in_completion": tokens_in_completion,
         }
 
-    def revise_paragraph(self, paragraph_text: str, section_name: str = None):
-        """
-        It revises a paragraph using GPT-3 completion model.
-
-        Arguments:
-            paragraph_text (str): Paragraph text to revise.
-            section_name (str): Section name of the paragraph.
-            throw_error (bool): If True, it throws an error if the API call fails.
-                If False, it returns the original paragraph text.
-
-        Returns:
-            Revised paragraph text.
-        """
+    def get_params(self, paragraph_text, section_name, resolved_prompt=None):
         max_tokens = self.get_max_tokens(paragraph_text)
-        prompt = self.get_prompt(paragraph_text, section_name)
+        prompt = self.get_prompt(paragraph_text, section_name, resolved_prompt)
 
         params = {
             "n": 1,
@@ -525,6 +495,23 @@ def revise_paragraph(self, paragraph_text: str, section_name: str = None):
 
         params.update(self.model_parameters)
 
+        return params
+
+    def revise_paragraph(self, paragraph_text: str, section_name: str = None, resolved_prompt=None):
+        """
+        It revises a paragraph using GPT-3 completion model.
+
+        Arguments:
+            paragraph_text (str): Paragraph text to revise.
+            section_name (str): Section name of the paragraph.
+            throw_error (bool): If True, it throws an error if the API call fails.
+                If False, it returns the original paragraph text.
+
+        Returns:
+            Revised paragraph text.
+        """
+        params = self.get_params(paragraph_text, section_name, resolved_prompt)
+
         retry_count = 0
         message = ""
         while message == "" and retry_count < self.retry_count:
@@ -581,3 +568,18 @@ def revise_paragraph(self, paragraph_text: str, section_name: str = None):
                 retry_count += 1
 
         return message
+
+
+class DebuggingManuscriptRevisionModel(GPT3CompletionModel):
+    """
+    This model returns the same paragraph and important information submitted to
+    the final revision function (i.e., that hits the remote API), such as the section
+    name and the resolved prompt.
+    """
+
+    def __init__(self, title: str = "", keywords: list[str] = None, **kwargs):
+        super().__init__(title, keywords, **kwargs)
+
+    def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
+        params = self.get_params(paragraph_text, section_name)
+        return f"%%%PARAGRAPH START%%%\n{params}\n%%%PARAGRAPH END%%%"

From 40432f066d6c1e0402f851db54bae43ce0ab574a Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@gmail.com>
Date: Wed, 14 Feb 2024 14:53:23 -0700
Subject: [PATCH 13/44] update DebuggingManuscriptRevisionModel to pretty print
 json

---
 libs/manubot_ai_editor/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 3787360..aab9675 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -3,6 +3,7 @@
 from abc import ABC, abstractmethod
 import random
 import time
+import json
 
 import openai
 
@@ -582,4 +583,4 @@ def __init__(self, title: str = "", keywords: list[str] = None, **kwargs):
 
     def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         params = self.get_params(paragraph_text, section_name)
-        return f"%%%PARAGRAPH START%%%\n{params}\n%%%PARAGRAPH END%%%"
+        return f"%%%PARAGRAPH START%%%\n{json.dumps(params, indent=4)}\n%%%PARAGRAPH END%%%"

From 4dac8073032901038af7ce7b6f253e9a9b065b96 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 13 Mar 2024 13:42:39 -0600
Subject: [PATCH 14/44] Fixes issue where default_prompt in
 ai_revision-config.yaml was treated as a literal string rather than an index
 into prompts

---
 libs/manubot_ai_editor/prompt_config.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
index b463359..63105a6 100644
--- a/libs/manubot_ai_editor/prompt_config.py
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -153,12 +153,18 @@ def get_prompt_for_filename(
                 if m := re.search(pattern, filename):
                     return (prompt if prompt is not None else IGNORE_FILE, m)
 
-        # finally, return the default prompt
-        return (
-            (
-                get_obj_path(self.config, ("files", "default_prompt"))
-                if use_default
-                else None
-            ),
-            None,
-        )
+        # finally, resolve the default prompt, which we do by:
+        # 1) checking if the 'default_prompt' key exists in the config file, using 'default' if it's unspecified
+        # 2) use whatever we resolved to reference the prompt from the 'prompts' collection
+        # 3) if we can't resolve a default prompt for whatever reason, return None
+        resolved_default_prompt = None
+        if use_default and self.prompts is not None:
+            resolved_default_prompt = self.prompts.get(
+                get_obj_path(self.config, ("files", "default_prompt"), missing="default"),
+                None
+            )
+
+            if resolved_default_prompt is not None:
+                resolved_default_prompt = resolved_default_prompt.strip()
+        
+        return (resolved_default_prompt, None)

From f3e7a916a34fd161989c0229f391df48f063966f Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 13 Mar 2024 13:43:17 -0600
Subject: [PATCH 15/44] Applied typing suggestions from pylance

---
 libs/manubot_ai_editor/prompt_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
index 63105a6..a475726 100644
--- a/libs/manubot_ai_editor/prompt_config.py
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -60,7 +60,7 @@ def _load_config(self) -> dict:
         except FileNotFoundError:
             return None
 
-    def _load_custom_prompts(self) -> (dict, dict):
+    def _load_custom_prompts(self) -> tuple[dict, dict]:
         """
         Loads custom prompts from ai_revision-prompts.yaml. The file
         must contain either 'prompts' or 'prompts_files' as top-level keys.
@@ -101,7 +101,7 @@ def _load_custom_prompts(self) -> (dict, dict):
 
     def get_prompt_for_filename(
         self, filename: str, use_default: bool = True
-    ) -> (Optional[str], Optional[re.Match]):
+    ) -> tuple[Optional[str], Optional[re.Match]]:
         """
         Retrieves the prompt for a given filename. It checks the following sources
         for a match in order:

From acb1d3d87b843e836f336b5c7145274b517d3697 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 13 Mar 2024 13:43:54 -0600
Subject: [PATCH 16/44] Strips prompts of leading and trailing spaces

---
 libs/manubot_ai_editor/prompt_config.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
index a475726..6c6d3c8 100644
--- a/libs/manubot_ai_editor/prompt_config.py
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -137,21 +137,22 @@ def get_prompt_for_filename(
                     # since we matched, use the 'prompts' collection to return a
                     # named prompt corresponding to the one from the 'matchings'
                     # collection
-                    return (
-                        (
-                            self.prompts.get(entry["prompt"], None)
-                            if self.prompts
-                            else None
-                        ),
-                        m,
-                    )
+                    resolved_prompt = None
+
+                    if self.prompts:
+                        resolved_prompt = self.prompts.get(entry["prompt"], None)
+
+                        if resolved_prompt is not None:
+                            resolved_prompt = resolved_prompt.strip()
+
+                    return ( resolved_prompt, m, )
 
         # since we haven't found a match yet, consult ai_revision-prompts.yaml's
         # 'prompts_files' collection
         if self.prompts_files:
             for pattern, prompt in self.prompts_files.items():
                 if m := re.search(pattern, filename):
-                    return (prompt if prompt is not None else IGNORE_FILE, m)
+                    return (prompt.strip() if prompt is not None else IGNORE_FILE, m)
 
         # finally, resolve the default prompt, which we do by:
         # 1) checking if the 'default_prompt' key exists in the config file, using 'default' if it's unspecified

From bbd9b08e5905f522d91dfff812f37d2a69a8ddd4 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 13 Mar 2024 13:45:06 -0600
Subject: [PATCH 17/44] Adds warning about both files.matchings and
 prompts_files being specified. Adds tests to verify the warning is shown.

---
 libs/manubot_ai_editor/prompt_config.py       | 25 ++++++++---
 .../ai_revision-config.yaml                   | 25 +++++++++++
 .../ai_revision-prompts.yaml                  | 42 +++++++++++++++++++
 tests/test_prompt_config.py                   | 42 ++++++++++++++++++-
 4 files changed, 128 insertions(+), 6 deletions(-)
 create mode 100644 tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml
 create mode 100644 tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml

diff --git a/libs/manubot_ai_editor/prompt_config.py b/libs/manubot_ai_editor/prompt_config.py
index 6c6d3c8..a455006 100644
--- a/libs/manubot_ai_editor/prompt_config.py
+++ b/libs/manubot_ai_editor/prompt_config.py
@@ -29,8 +29,8 @@ class ManuscriptPromptConfig:
     Loads configuration from two YAML files in 'content_dir':
     -  ai_revision-prompts.yaml, which contains custom prompt definitions and/or
     mappings of prompts to files
-    - ai_revision-config.yaml, containing general
-    configuration for the AI revision process
+    - ai_revision-config.yaml, containing general configuration for the AI
+    revision process
 
     After loading, the main use of this class is to resolve a prompt for a given
     filename. This is done by calling config.get_prompt_for_filename(<filename>),
@@ -43,6 +43,19 @@ def __init__(self, content_dir: str, title: str, keywords: str) -> None:
         self.config = self._load_config()
         self.prompts, self.prompts_files = self._load_custom_prompts()
 
+        # validation: both self.config.files.matchings and self.prompts_files
+        # specify filename-to-prompt mappings; if both are present, we use
+        # self.config.files, but warn the user that they should only use one
+        if (
+            self.prompts_files is not None and
+            self.config is not None and
+            self.config.get('files', {}).get('matchings') is not None
+        ):
+            print(
+                "WARNING: Both 'ai_revision-config.yaml' and 'ai_revision-prompts.yaml' specify filename-to-prompt mappings. "
+                "Only the 'ai_revision-config.yaml' file's file.matchings section will be used; prompts_files will be ignored."
+            )
+
         # storing these so they can be interpolated into prompts
         self.title = title
         self.keywords = keywords
@@ -124,9 +137,11 @@ def get_prompt_for_filename(
             if m := re.search(ignore, filename):
                 return (IGNORE_FILE, m)
 
-        # FIXME: which takes priority, the files collection in ai_revision-config.yaml
-        #  or the prompt_file? we went with config taking precendence for now
-
+        # if both ai_revision-config.yaml specifies files.matchings and
+        # ai_revision-prompts.yaml specifies prompts_files, then files.matchings
+        # takes precedence.
+        # (the user is notified of this in a validation warning in __init__)
+        
         # then, consult ai_revision-config.yaml's 'matchings' collection if a
         # match is found, use the prompt ai_revision-prompts.yaml
         for entry in get_obj_path(self.config, ("files", "matchings"), missing=[]):
diff --git a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml
new file mode 100644
index 0000000..dd71dcd
--- /dev/null
+++ b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml
@@ -0,0 +1,25 @@
+files:
+  matchings:
+    - files:
+        - abstract
+      prompt: abstract
+    - files:
+        - introduction
+      prompt: introduction_discussion
+    - files:
+        - 04\..+\.md
+      prompt: results
+    - files:
+        - discussion
+      prompt: introduction_discussion
+    - files:
+        - methods
+      prompt: methods
+  
+  default_prompt: This is the default prompt
+  
+  ignore:
+    - front-matter
+    - acknowledgements
+    - supplementary_material
+    - references
diff --git a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
new file mode 100644
index 0000000..dde172e
--- /dev/null
+++ b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
@@ -0,0 +1,42 @@
+prompts_files:
+  abstract: |
+    Revise the following paragraph from the Abstract of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      the research problem/question is clear,
+      the solution proposed is clear,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text is in active voice and has a clear sentence structure
+
+  introduction|discussion: |
+    Revise the following paragraph from the {file.section.capitalize()} of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      the research problem/question is clear,
+      the solution proposed is clear,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text is in active voice and has a clear sentence structure
+
+  results: |
+    Revise the following paragraph from the Results section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+      most references to figures and tables are kept,
+      the details are enough to clearly explain the outcomes,
+      sentences are concise and to the point,
+      the text minimizes the use of jargon,
+      the text grammar is correct,
+      spelling errors are fixed,
+      and the text has a clear sentence structure
+
+  methods: |
+    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+       most of the citations to other academic papers are kept,
+       most of the technical details are kept,
+       most references to equations (such as "Equation (@id)") are kept,
+       all equations definitions (such as '*equation_definition') are included with newlines before and after,
+       the most important symbols in equations are defined,
+       the text grammar is correct,
+       spelling errors are fixed,
+       and the text has a clear sentence structure
+
+  references: null
+
+  \.md$: |
+    Proofread the following paragraph
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index 5b4863e..f6113dd 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -137,7 +137,11 @@ def test_unresolved_gets_default_prompt():
 SINGLE_GENERIC_PROMPT_DIR = (
     Path(__file__).parent / "config_loader_fixtures" / "single_generic_prompt"
 )
-
+# - Both ai_revision-config.yaml and ai-revision-prompts.yaml specify filename matchings
+#   (conflicting_promptsfiles_matchings)
+CONFLICTING_PROMPTSFILES_MATCHINGS_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "conflicting_promptsfiles_matchings"
+)
 # ---
 # test ManuscriptEditor.prompt_config sub-attributes are set correctly
 # ---
@@ -195,6 +199,42 @@ def test_single_generic_loaded():
     assert editor.prompt_config.config is not None
 
 
+@mock.patch(
+    "builtins.open", mock_unify_open(MANUSCRIPTS_DIR, CONFLICTING_PROMPTSFILES_MATCHINGS_DIR)
+)
+def test_conflicting_sources_warning(capfd):
+    """
+    Tests that a warning is printed when both ai_revision-prompts.yaml and
+    ai_revision-config.yaml specify filename-to-prompt mappings.
+
+    Specifically, the dicts that map filenames to prompts are:
+    - ai_revision-prompts.yaml: 'prompts_files'
+    - ai_revision-config.yaml: 'files.matchings'
+
+    If both are specified, the 'files.matchings' key in ai_revision-config.yaml
+    takes precedence, but a warning is printed.
+    """
+
+    editor = get_editor()
+
+    # ensure that only the prompts defined in ai_revision-prompts.yaml are loaded
+    assert editor.prompt_config.prompts is None
+    assert editor.prompt_config.config is not None
+    # for this test, we define both prompts_files and files.matchings which
+    # creates a conflict that produces the warning we're looking for
+    assert editor.prompt_config.prompts_files is not None
+    assert editor.prompt_config.config['files']['matchings'] is not None
+
+    expected_warning = (
+        "WARNING: Both 'ai_revision-config.yaml' and "
+        "'ai_revision-prompts.yaml' specify filename-to-prompt mappings. Only the "
+        "'ai_revision-config.yaml' file's file.matchings section will be used; "
+        "prompts_files will be ignored."
+    )
+
+    out, _ = capfd.readouterr()
+    assert expected_warning in out
+
 # ---
 # test that ignored files are ignored in applicable scenarios
 # ---

From 2e7d8975f9795fbe5d9bba0ef84e3d513e7dbc0a Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 13 Mar 2024 13:45:54 -0600
Subject: [PATCH 18/44] Updates phenoplier prompt config tests to treat the
 config prompt reference as a reference into prompts, not a literal.

---
 .../phenoplier_full/ai_revision-config.yaml                   | 2 +-
 .../phenoplier_full/ai_revision-prompts.yaml                  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/config_loader_fixtures/phenoplier_full/ai_revision-config.yaml b/tests/config_loader_fixtures/phenoplier_full/ai_revision-config.yaml
index d166741..553e24e 100644
--- a/tests/config_loader_fixtures/phenoplier_full/ai_revision-config.yaml
+++ b/tests/config_loader_fixtures/phenoplier_full/ai_revision-config.yaml
@@ -16,7 +16,7 @@ files:
         - methods
       prompt: methods
   
-  default_prompt: default prompt text
+  default_prompt: my_default_prompt
   
   ignore:
     - front\-matter
diff --git a/tests/config_loader_fixtures/phenoplier_full/ai_revision-prompts.yaml b/tests/config_loader_fixtures/phenoplier_full/ai_revision-prompts.yaml
index acb4594..f17a880 100644
--- a/tests/config_loader_fixtures/phenoplier_full/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/phenoplier_full/ai_revision-prompts.yaml
@@ -11,5 +11,5 @@ prompts:
   methods: |
     Test match methods.
 
-  default: |
-    Proofread the following paragraph
+  my_default_prompt: |
+    default prompt text

From e8dfb238095f76ee1556ec301c91bf111b20c6a3 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 27 Mar 2024 12:50:59 -0600
Subject: [PATCH 19/44] Fixes issue where resolved_prompt's value wasn't being
 propagated down to all invocations of model.get_prompt()

---
 libs/manubot_ai_editor/editor.py | 2 +-
 libs/manubot_ai_editor/models.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libs/manubot_ai_editor/editor.py b/libs/manubot_ai_editor/editor.py
index a445dbf..afbf69a 100644
--- a/libs/manubot_ai_editor/editor.py
+++ b/libs/manubot_ai_editor/editor.py
@@ -433,7 +433,7 @@ def revise_file(
                     paragraph,
                     revision_model,
                     section_name,
-                    resolved_prompt=None,
+                    resolved_prompt=resolved_prompt,
                     outfile=outfile,
                 )
 
diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index aab9675..0717060 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -34,7 +34,7 @@ def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         raise NotImplementedError
 
     @abstractmethod
-    def get_prompt(self, paragraph_text, section_name):
+    def get_prompt(self, paragraph_text, section_name, resolved_prompt: str = None):
         """
         Returns the prompt to be used for the revision of a paragraph that
         belongs to a given section.
@@ -63,7 +63,7 @@ def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
 
         return self.sentence_end_pattern.sub(". ", paragraph_text).strip()
 
-    def get_prompt(self, paragraph_text, section_name):
+    def get_prompt(self, paragraph_text, section_name, resolved_prompt: str = None):
         return paragraph_text
 
 
@@ -114,7 +114,7 @@ def revise_paragraph(
             sentences_revised.append(" ".join(words_revised))
         return ". ".join(sentences_revised)
 
-    def get_prompt(self, paragraph_text, section_name):
+    def get_prompt(self, paragraph_text, section_name, resolved_prompt: str = None):
         return paragraph_text
 
 

From 55972b494ec6a899827a58cdf07644d2cb28df5f Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 27 Mar 2024 12:52:40 -0600
Subject: [PATCH 20/44] Applies placeholder replacements to ai_revision-derived
 prompts, as it was with the other prompts.

---
 libs/manubot_ai_editor/models.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 0717060..849d4f4 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -312,7 +312,14 @@ def get_prompt(
             prompt = custom_prompt.format(**placeholders)
         elif resolved_prompt:
             # use the resolved prompt from the ai_revision config files, if available
-            prompt = resolved_prompt
+            # replace placeholders with their actual values
+            replacements = {
+                "paragraph_text": paragraph_text.strip(),
+                "section_name": section_name,
+                "title": self.title,
+                "keywords": ", ".join(self.keywords),
+            }
+            prompt = resolved_prompt.format(**replacements)
         elif section_name in ("abstract",):
             prompt = f"""
                 Revise the following paragraph from the {section_name} of an academic paper (with the title '{self.title}' and keywords '{", ".join(self.keywords)}')

From 73870ae97e137c49ab71c091e25a7dc304f79140 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 27 Mar 2024 12:53:03 -0600
Subject: [PATCH 21/44] Adds a bit more debugging output to
 DebuggingManuscriptRevisionModel, adds a title so we can test the placeholder
 replacement of {title}.

---
 libs/manubot_ai_editor/models.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 849d4f4..ea1629d 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -585,9 +585,11 @@ class DebuggingManuscriptRevisionModel(GPT3CompletionModel):
     name and the resolved prompt.
     """
 
-    def __init__(self, title: str = "", keywords: list[str] = None, **kwargs):
+    def __init__(self, title: str = "debugging-manuscript", keywords: list[str] = None, **kwargs):
         super().__init__(title, keywords, **kwargs)
 
     def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
-        params = self.get_params(paragraph_text, section_name)
-        return f"%%%PARAGRAPH START%%%\n{json.dumps(params, indent=4)}\n%%%PARAGRAPH END%%%"
+        params = self.get_params(paragraph_text, section_name, resolved_prompt)
+        json_params = json.dumps(params, indent=4)
+        print(json_params)
+        return f"%%%PARAGRAPH START%%%\n{json_params}\n%%%PARAGRAPH END%%%"

From 264d15008175f0fe3804639ec4fda4e1650b2618 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 3 Apr 2024 20:56:55 -0600
Subject: [PATCH 22/44] Changed 'default_prompt' in the tests to refer to a
 prompt key and not be literal text; added the default prompt where it was
 missing.

---
 .../both_prompts_config/ai_revision-config.yaml                 | 2 +-
 .../conflicting_promptsfiles_matchings/ai_revision-config.yaml  | 2 +-
 .../conflicting_promptsfiles_matchings/ai_revision-prompts.yaml | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml b/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml
index dd71dcd..8dc5692 100644
--- a/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml
+++ b/tests/config_loader_fixtures/both_prompts_config/ai_revision-config.yaml
@@ -16,7 +16,7 @@ files:
         - methods
       prompt: methods
   
-  default_prompt: This is the default prompt
+  default_prompt: default
   
   ignore:
     - front-matter
diff --git a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml
index dd71dcd..8dc5692 100644
--- a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml
+++ b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-config.yaml
@@ -16,7 +16,7 @@ files:
         - methods
       prompt: methods
   
-  default_prompt: This is the default prompt
+  default_prompt: default
   
   ignore:
     - front-matter
diff --git a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
index dde172e..633f0ce 100644
--- a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
@@ -38,5 +38,7 @@ prompts_files:
 
   references: null
 
+  default: This is the default prompt
+
   \.md$: |
     Proofread the following paragraph

From 88ba10ec608112605a20176bebe5abf86f1acb72 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 3 Apr 2024 20:59:01 -0600
Subject: [PATCH 23/44] Changed invalid field references to items included in
 the format() calls in GPT3CompletionModel.get_prompt()

---
 .../both_prompts_config/ai_revision-prompts.yaml          | 8 ++++----
 .../ai_revision-prompts.yaml                              | 8 ++++----
 .../only_revision_prompts/ai_revision-prompts.yaml        | 8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml b/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
index 281e2ab..8c8f9d6 100644
--- a/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
@@ -1,6 +1,6 @@
 prompts:
   abstract: |
-    Revise the following paragraph from the Abstract of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the Abstract of an academic paper (with the title '{title}' and keywords '{keywords}') so
       the research problem/question is clear,
       the solution proposed is clear,
       the text grammar is correct,
@@ -8,7 +8,7 @@ prompts:
       and the text is in active voice and has a clear sentence structure
 
   introduction_discussion: |
-    Revise the following paragraph from the {file.section.capitalize()} of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the {section_name} of an academic paper (with the title '{title}' and keywords '{keywords}') so
       the research problem/question is clear,
       the solution proposed is clear,
       the text grammar is correct,
@@ -16,7 +16,7 @@ prompts:
       and the text is in active voice and has a clear sentence structure
 
   results: |
-    Revise the following paragraph from the Results section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the Results section of an academic paper (with the title '{title}' and keywords '{keywords}') so
       most references to figures and tables are kept,
       the details are enough to clearly explain the outcomes,
       sentences are concise and to the point,
@@ -26,7 +26,7 @@ prompts:
       and the text has a clear sentence structure
 
   methods: |
-    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{title}' and keywords '{keywords}') so
        most of the citations to other academic papers are kept,
        most of the technical details are kept,
        most references to equations (such as "Equation (@id)") are kept,
diff --git a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
index 633f0ce..3f96619 100644
--- a/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/conflicting_promptsfiles_matchings/ai_revision-prompts.yaml
@@ -1,6 +1,6 @@
 prompts_files:
   abstract: |
-    Revise the following paragraph from the Abstract of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the Abstract of an academic paper (with the title '{title}' and keywords '{keywords}') so
       the research problem/question is clear,
       the solution proposed is clear,
       the text grammar is correct,
@@ -8,7 +8,7 @@ prompts_files:
       and the text is in active voice and has a clear sentence structure
 
   introduction|discussion: |
-    Revise the following paragraph from the {file.section.capitalize()} of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the {section_name} of an academic paper (with the title '{title}' and keywords '{keywords}') so
       the research problem/question is clear,
       the solution proposed is clear,
       the text grammar is correct,
@@ -16,7 +16,7 @@ prompts_files:
       and the text is in active voice and has a clear sentence structure
 
   results: |
-    Revise the following paragraph from the Results section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the Results section of an academic paper (with the title '{title}' and keywords '{keywords}') so
       most references to figures and tables are kept,
       the details are enough to clearly explain the outcomes,
       sentences are concise and to the point,
@@ -26,7 +26,7 @@ prompts_files:
       and the text has a clear sentence structure
 
   methods: |
-    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{title}' and keywords '{keywords}') so
        most of the citations to other academic papers are kept,
        most of the technical details are kept,
        most references to equations (such as "Equation (@id)") are kept,
diff --git a/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml b/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml
index dde172e..9539aca 100644
--- a/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/only_revision_prompts/ai_revision-prompts.yaml
@@ -1,6 +1,6 @@
 prompts_files:
   abstract: |
-    Revise the following paragraph from the Abstract of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the Abstract of an academic paper (with the title '{title}' and keywords '{keywords}') so
       the research problem/question is clear,
       the solution proposed is clear,
       the text grammar is correct,
@@ -8,7 +8,7 @@ prompts_files:
       and the text is in active voice and has a clear sentence structure
 
   introduction|discussion: |
-    Revise the following paragraph from the {file.section.capitalize()} of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the {section_name} of an academic paper (with the title '{title}' and keywords '{keywords}') so
       the research problem/question is clear,
       the solution proposed is clear,
       the text grammar is correct,
@@ -16,7 +16,7 @@ prompts_files:
       and the text is in active voice and has a clear sentence structure
 
   results: |
-    Revise the following paragraph from the Results section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the following paragraph from the Results section of an academic paper (with the title '{title}' and keywords '{keywords}') so
       most references to figures and tables are kept,
       the details are enough to clearly explain the outcomes,
       sentences are concise and to the point,
@@ -26,7 +26,7 @@ prompts_files:
       and the text has a clear sentence structure
 
   methods: |
-    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{manuscript.title}' and keywords '{manuscript.keywords}') so
+    Revise the paragraph(s) below from the Methods section of an academic paper (with the title '{title}' and keywords '{keywords}') so
        most of the citations to other academic papers are kept,
        most of the technical details are kept,
        most references to equations (such as "Equation (@id)") are kept,

From e27a99649827b8359b073c371cd6fa6598c6e5ee Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 3 Apr 2024 21:00:25 -0600
Subject: [PATCH 24/44] Adds an end-to-end test that applies custom prompts via
 the DebuggingManuscriptRevisionModel and then looks for the prompts' text in
 the resulting .md files

---
 .../ai_revision-config.yaml                   | 47 ++++++++++
 .../ai_revision-prompts.yaml                  | 17 ++++
 tests/test_prompt_config.py                   | 85 +++++++++++++++++--
 3 files changed, 143 insertions(+), 6 deletions(-)
 create mode 100644 tests/config_loader_fixtures/prompt_propogation/ai_revision-config.yaml
 create mode 100644 tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml

diff --git a/tests/config_loader_fixtures/prompt_propogation/ai_revision-config.yaml b/tests/config_loader_fixtures/prompt_propogation/ai_revision-config.yaml
new file mode 100644
index 0000000..41fab5d
--- /dev/null
+++ b/tests/config_loader_fixtures/prompt_propogation/ai_revision-config.yaml
@@ -0,0 +1,47 @@
+files:
+  matchings:
+    - files:
+      - front-matter
+      prompt: front_matter
+    - files:
+      - abstract
+      prompt: abstract
+    - files:
+      - introduction
+      prompt: introduction
+    - files:
+      - results_framework
+      prompt: results_framework
+    - files:
+      - results
+      prompt: results
+    - files:
+      - crispr
+      prompt: crispr
+    - files:
+      - drug_disease_prediction
+      prompt: drug_disease_prediction
+    - files:
+      - traits_clustering
+      prompt: traits_clustering
+    - files:
+      - discussion
+      prompt: discussion
+    - files:
+      - methods
+      prompt: methods
+    - files:
+      - references
+      prompt: references
+    - files:
+      - acknowledgements
+      prompt: acknowledgements
+    - files:
+      - supplementary_material
+      prompt: supplementary_material
+
+  default_prompt: default
+
+  ignore:
+    - results
+    - references
diff --git a/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml b/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml
new file mode 100644
index 0000000..bbb95a4
--- /dev/null
+++ b/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml
@@ -0,0 +1,17 @@
+prompts:
+  front_matter: This is the front-matter prompt
+  abstract: This is the abstract prompt
+  introduction: This is the introduction prompt
+  results: This is the results prompt
+  results_framework: This is the results_framework prompt
+  crispr: This is the crispr prompt
+  drug_disease_prediction: This is the drug_disease_prediction prompt
+  traits_clustering: This is the traits_clustering prompt
+  discussion: This is the discussion prompt
+  methods: This is the methods prompt
+  references: This is the references prompt
+  acknowledgements: This is the acknowledgements prompt
+  supplementary_material: This is the supplementary_material prompt
+
+  default: |
+    This is the default prompt
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index f6113dd..83bbb92 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -2,7 +2,11 @@
 from unittest import mock
 
 from manubot_ai_editor.editor import ManuscriptEditor
-from manubot_ai_editor.models import GPT3CompletionModel, RandomManuscriptRevisionModel
+from manubot_ai_editor.models import (
+    GPT3CompletionModel,
+    RandomManuscriptRevisionModel,
+    DebuggingManuscriptRevisionModel
+)
 from manubot_ai_editor.prompt_config import IGNORE_FILE
 import pytest
 
@@ -119,7 +123,8 @@ def test_unresolved_gets_default_prompt():
 
 
 # ==============================================================================
-# === prompts_files tests, using ai_revision-prompts.yaml w/ai_revision-config.yaml to process ignores, defaults
+# === prompts_files tests, using ai_revision-prompts.yaml w/
+# === ai_revision-config.yaml to process ignores, defaults
 # ==============================================================================
 
 # the following tests are derived from examples in
@@ -235,9 +240,10 @@ def test_conflicting_sources_warning(capfd):
     out, _ = capfd.readouterr()
     assert expected_warning in out
 
-# ---
-# test that ignored files are ignored in applicable scenarios
-# ---
+
+# ==============================================================================
+# === test that ignored files are ignored in applicable scenarios
+# ==============================================================================
 
 # places in configs where files can be ignored:
 # ai_revision-config.yaml: the `files.ignore` key
@@ -248,7 +254,8 @@ def test_conflicting_sources_warning(capfd):
     "model",
     [
         RandomManuscriptRevisionModel(),
-        GPT3CompletionModel(None, None),
+        DebuggingManuscriptRevisionModel()
+        # GPT3CompletionModel(None, None),
     ],
 )
 @mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, BOTH_PROMPTS_CONFIG_DIR))
@@ -267,3 +274,69 @@ def test_revise_entire_manuscript(tmp_path, model):
     # after processing ignores, we should be left with 9 files from the original 12
     output_md_files = list(output_folder.glob("*.md"))
     assert len(output_md_files) == 9
+
+# ==============================================================================
+# === end-to-end tests, to verify that the prompts are making it into the final result
+# ==============================================================================
+
+PROMPT_PROPOGATION_CONFIG_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "prompt_propogation"
+)
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        DebuggingManuscriptRevisionModel(),
+    ],
+)
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PROMPT_PROPOGATION_CONFIG_DIR))
+def test_prompts_in_final_result(tmp_path, model):
+    """
+    Tests that the prompts are making it into the final resulting .md files.
+
+    This test uses the DebuggingManuscriptRevisionModel, which is a model that
+    inserts the prompt and other parameters into the final result. Using this
+    model, we can test that the prompt we entered is used when applying the LLM.
+
+    Note that 04.00.results.md contains no actual text, just a comment, so
+    there's no paragraphs to assign a prompt and thus no result; we explicitly
+    ignore the file in the config and in the test below.
+
+    10.references.md also contains no actual text, just an HTML element where
+    the references get inserted by another system (assumedly manubot), so we
+    ignore it in the config and in this test as well.
+    """
+    me = get_editor()
+
+    model.title = me.title
+    model.keywords = me.keywords
+
+    output_folder = tmp_path
+    assert output_folder.exists()
+
+    me.revise_manuscript(output_folder, model)
+
+    # mapping of filenames to prompts to check in the result
+    files_to_prompts = {
+        "00.front-matter.md": "This is the front-matter prompt",
+        "01.abstract.md": "This is the abstract prompt",
+        "02.introduction.md": "This is the introduction prompt",
+        # "04.00.results.md": "This is the results prompt",
+        "04.05.00.results_framework.md": "This is the results_framework prompt",
+        "04.05.01.crispr.md": "This is the crispr prompt",
+        "04.15.drug_disease_prediction.md": "This is the drug_disease_prediction prompt",
+        "04.20.00.traits_clustering.md": "This is the traits_clustering prompt",
+        "05.discussion.md": "This is the discussion prompt",
+        "07.00.methods.md": "This is the methods prompt",
+        # "10.references.md": "This is the references prompt",
+        "15.acknowledgements.md": "This is the acknowledgements prompt",
+        "50.00.supplementary_material.md": "This is the supplementary_material prompt",
+    }
+
+    # check that the prompts are in the final result
+    output_md_files = list(output_folder.glob("*.md"))
+
+    for output_md_file in output_md_files:
+        with open(output_md_file, "r") as f:
+            content = f.read()
+            assert files_to_prompts[output_md_file.name].strip() in content

From 8b92c45cb3b47d5ce664122e71ddfb995061fa82 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 4 Apr 2024 20:05:46 -0600
Subject: [PATCH 25/44] Adds e2e test that uses the GPT3 model to revise each
 paragraph to include a sentinel value

---
 .../prompt_gpt3_e2e/ai_revision-config.yaml   | 47 +++++++++++++
 .../prompt_gpt3_e2e/ai_revision-prompts.yaml  | 17 +++++
 tests/test_prompt_config.py                   | 67 +++++++++++++++++++
 3 files changed, 131 insertions(+)
 create mode 100644 tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-config.yaml
 create mode 100644 tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-prompts.yaml

diff --git a/tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-config.yaml b/tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-config.yaml
new file mode 100644
index 0000000..41fab5d
--- /dev/null
+++ b/tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-config.yaml
@@ -0,0 +1,47 @@
+files:
+  matchings:
+    - files:
+      - front-matter
+      prompt: front_matter
+    - files:
+      - abstract
+      prompt: abstract
+    - files:
+      - introduction
+      prompt: introduction
+    - files:
+      - results_framework
+      prompt: results_framework
+    - files:
+      - results
+      prompt: results
+    - files:
+      - crispr
+      prompt: crispr
+    - files:
+      - drug_disease_prediction
+      prompt: drug_disease_prediction
+    - files:
+      - traits_clustering
+      prompt: traits_clustering
+    - files:
+      - discussion
+      prompt: discussion
+    - files:
+      - methods
+      prompt: methods
+    - files:
+      - references
+      prompt: references
+    - files:
+      - acknowledgements
+      prompt: acknowledgements
+    - files:
+      - supplementary_material
+      prompt: supplementary_material
+
+  default_prompt: default
+
+  ignore:
+    - results
+    - references
diff --git a/tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-prompts.yaml b/tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-prompts.yaml
new file mode 100644
index 0000000..548a0f6
--- /dev/null
+++ b/tests/config_loader_fixtures/prompt_gpt3_e2e/ai_revision-prompts.yaml
@@ -0,0 +1,17 @@
+prompts:
+  front_matter: Revise the following paragraph to include the keyword "testify" somewhere in the text.
+  abstract: Revise the following paragraph to include the keyword "bottle" somewhere in the text.
+  introduction: Revise the following paragraph to include the keyword "wound" somewhere in the text.
+  results: Revise the following paragraph to include the keyword "classroom" somewhere in the text.
+  results_framework: Revise the following paragraph to include the keyword "secretary" somewhere in the text.
+  crispr: Revise the following paragraph to include the keyword "army" somewhere in the text.
+  drug_disease_prediction: Revise the following paragraph to include the keyword "breakdown" somewhere in the text.
+  traits_clustering: Revise the following paragraph to include the keyword "siege" somewhere in the text.
+  discussion: Revise the following paragraph to include the keyword "beer" somewhere in the text.
+  methods: Revise the following paragraph to include the keyword "confront" somewhere in the text.
+  references: Revise the following paragraph to include the keyword "disability" somewhere in the text.
+  acknowledgements: Revise the following paragraph to include the keyword "stitch" somewhere in the text.
+  supplementary_material: Revise the following paragraph to include the keyword "waiter" somewhere in the text.
+
+  default: |
+    This is the default prompt
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index 83bbb92..d2bb2de 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -340,3 +340,70 @@ def test_prompts_in_final_result(tmp_path, model):
         with open(output_md_file, "r") as f:
             content = f.read()
             assert files_to_prompts[output_md_file.name].strip() in content
+
+# live GPT version of the test, with a different prompt
+
+PROMPT_PROPOGATION_CONFIG_DIR = (
+    Path(__file__).parent / "config_loader_fixtures" / "prompt_gpt3_e2e"
+)
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        GPT3CompletionModel(
+            title="Debug Manuscript", keywords=["debug"],
+            model_engine="gpt-3.5-turbo"
+        ),
+    ],
+)
+@pytest.mark.cost
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PROMPT_PROPOGATION_CONFIG_DIR))
+def test_prompts_apply_gpt3(tmp_path, model):
+    """
+    Tests that the custom prompts are applied when actually applying
+    the prompts to an LLM.
+
+    This test uses the GPT3CompletionModel, which performs a query against
+    the live OpenAI service, thus it does incur cost. Because of that,
+    this test is marked 'cost' and requires the --runcost argument to be run,
+    e.g. to run just this test: `pytest --runcost -k test_prompts_apply_gpt3`.
+
+    As with test_prompts_in_final_result above, files that have no input and 
+    thus no applied prompt are ignored.
+    """
+    me = get_editor()
+
+    model.title = me.title
+    model.keywords = me.keywords
+
+    output_folder = tmp_path
+    assert output_folder.exists()
+
+    me.revise_manuscript(output_folder, model)
+
+    # mapping of filenames to keywords, present in the prompt, to check in the
+    # result. (these words were generated by https://randomwordgenerator.com/,
+    # fyi, not chosen for any particular reason)
+    files_to_keywords = {
+        "00.front-matter.md": "testify",
+        "01.abstract.md": "bottle",
+        "02.introduction.md": "wound",
+        # "04.00.results.md": "classroom",
+        "04.05.00.results_framework.md": "secretary",
+        "04.05.01.crispr.md": "army",
+        "04.15.drug_disease_prediction.md": "breakdown",
+        "04.20.00.traits_clustering.md": "siege",
+        "05.discussion.md": "beer",
+        "07.00.methods.md": "confront",
+        # "10.references.md": "disability",
+        "15.acknowledgements.md": "stitch",
+        "50.00.supplementary_material.md": "waiter",
+    }
+
+    # check that the prompts are in the final result
+    output_md_files = list(output_folder.glob("*.md"))
+
+    for output_md_file in output_md_files:
+        with open(output_md_file, "r") as f:
+            content = f.read()
+            assert files_to_keywords[output_md_file.name].strip() in content

From 8c39e8f51297748deea620dac112b7340dc31884 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 10 Apr 2024 11:35:47 -0600
Subject: [PATCH 26/44] Removes unnecessary printing of params in
 DebuggingManuscriptRevisionModel

---
 libs/manubot_ai_editor/models.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index ea1629d..fc90c2b 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -591,5 +591,4 @@ def __init__(self, title: str = "debugging-manuscript", keywords: list[str] = No
     def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         params = self.get_params(paragraph_text, section_name, resolved_prompt)
         json_params = json.dumps(params, indent=4)
-        print(json_params)
         return f"%%%PARAGRAPH START%%%\n{json_params}\n%%%PARAGRAPH END%%%"

From 4377d80adbbe8b3430cf5e86be7effdb4d90ec21 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 10 Apr 2024 11:39:56 -0600
Subject: [PATCH 27/44] Per MP's idea, adds single-paragraph version of
 phenoplier, used in test_prompts_apply_gpt3. Adds manuscript_dir argument to
 get_editor() so we can specify a different manuscript folder.

---
 .../00.front-matter.md                        |   77 +
 .../01.abstract.md                            |   12 +
 .../02.introduction.md                        |    7 +
 .../04.00.results.md                          |   10 +
 .../04.05.00.results_framework.md             |    9 +
 .../04.05.01.crispr.md                        |    8 +
 .../04.15.drug_disease_prediction.md          |    9 +
 .../04.20.00.traits_clustering.md             |   11 +
 .../05.discussion.md                          |    8 +
 .../07.00.methods.md                          |    9 +
 .../10.references.md                          |    4 +
 .../15.acknowledgements.md                    |   13 +
 .../50.00.supplementary_material.md           |   13 +
 .../manual-references.json                    | 9296 +++++++++++++++++
 .../metadata.yaml                             |  134 +
 tests/test_prompt_config.py                   |   47 +-
 16 files changed, 9641 insertions(+), 26 deletions(-)
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/00.front-matter.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/01.abstract.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/02.introduction.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/04.00.results.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/04.05.00.results_framework.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/04.05.01.crispr.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/04.15.drug_disease_prediction.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/04.20.00.traits_clustering.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/05.discussion.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/07.00.methods.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/10.references.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/15.acknowledgements.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/50.00.supplementary_material.md
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/manual-references.json
 create mode 100644 tests/manuscripts/phenoplier_full_only_first_para/metadata.yaml

diff --git a/tests/manuscripts/phenoplier_full_only_first_para/00.front-matter.md b/tests/manuscripts/phenoplier_full_only_first_para/00.front-matter.md
new file mode 100644
index 0000000..c86f5cb
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/00.front-matter.md
@@ -0,0 +1,77 @@
+{##
+  This file contains a Jinja2 front-matter template that adds version and authorship information.
+  Changing the Jinja2 templates in this file may cause incompatibility with Manubot updates.
+  Pandoc automatically inserts title from metadata.yaml, so it is not included in this template.
+##}
+
+_A DOI-citable version of this manuscript is available at<br /><https://doi.org/10.1038/s41467-023-41057-4>_
+
+<!-- {## Template to insert build date and source ##}
+<small><em>
+This manuscript
+{% if manubot.ci_source is defined and manubot.ci_source.provider == "appveyor" -%}
+([permalink]({{manubot.ci_source.artifact_url}}))
+{% elif manubot.html_url_versioned is defined -%}
+([permalink]({{manubot.html_url_versioned}}))
+{% endif -%}
+was automatically generated
+{% if manubot.ci_source is defined -%}
+from [{{manubot.ci_source.repo_slug}}@{{manubot.ci_source.commit | truncate(length=7, end='', leeway=0)}}](https://github.com/{{manubot.ci_source.repo_slug}}/tree/{{manubot.ci_source.commit}})
+{% endif -%}
+on {{manubot.generated_date_long}}.
+</em></small> -->
+
+{% if manubot.date_long != manubot.generated_date_long -%}
+Published: {{manubot.date_long}}
+{% endif %}
+
+## Authors
+
+{## Template for listing authors ##}
+{% for author in manubot.authors %}
++ **{{author.name}}**
+  {% if author.corresponding is defined and author.corresponding == true -%}^[✉](#correspondence)^{%- endif -%}
+  <br>
+  {%- set has_ids = false %}
+  {%- if author.orcid is defined and author.orcid is not none %}
+    {%- set has_ids = true %}
+    ![ORCID icon](images/orcid.svg){.inline_icon width=16 height=16}
+    [{{author.orcid}}](https://orcid.org/{{author.orcid}})
+  {%- endif %}
+  {%- if author.github is defined and author.github is not none %}
+    {%- set has_ids = true %}
+    · ![GitHub icon](images/github.svg){.inline_icon width=16 height=16}
+    [{{author.github}}](https://github.com/{{author.github}})
+  {%- endif %}
+  {%- if author.twitter is defined and author.twitter is not none %}
+    {%- set has_ids = true %}
+    · ![Twitter icon](images/twitter.svg){.inline_icon width=16 height=16}
+    [{{author.twitter}}](https://twitter.com/{{author.twitter}})
+  {%- endif %}
+  {%- if author.mastodon is defined and author.mastodon is not none and author["mastodon-server"] is defined and author["mastodon-server"] is not none %}
+    {%- set has_ids = true %}
+    · ![Mastodon icon](images/mastodon.svg){.inline_icon width=16 height=16}
+    [\@{{author.mastodon}}@{{author["mastodon-server"]}}](https://{{author["mastodon-server"]}}/@{{author.mastodon}})
+  {%- endif %}
+  {%- if has_ids %}
+    <br>
+  {%- endif %}
+  <small>
+  {%- if author.affiliations is defined and author.affiliations|length %}
+     {{author.affiliations | join('; ')}}
+  {%- endif %}
+  {%- if author.funders is defined and author.funders|length %}
+     · Funded by {{author.funders | join('; ')}}
+  {%- endif %}
+  </small>
+{% endfor %}
+
+::: {#correspondence}
+✉ — Correspondence possible via {% if manubot.ci_source is defined -%}[GitHub Issues](https://github.com/{{manubot.ci_source.repo_slug}}/issues){% else %}GitHub Issues{% endif %}
+{% if manubot.authors|map(attribute='corresponding')|select|max -%}
+or email to
+{% for author in manubot.authors|selectattr("corresponding") -%}
+{{ author.name }} \<{{ author.email }}\>{{ ", " if not loop.last else "." }}
+{% endfor %}
+{% endif %}
+:::
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/01.abstract.md b/tests/manuscripts/phenoplier_full_only_first_para/01.abstract.md
new file mode 100644
index 0000000..6976ff0
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/01.abstract.md
@@ -0,0 +1,12 @@
+## Abstract {.page_break_before}
+
+Genes act in concert with each other in specific contexts to perform their functions.
+Determining how these genes influence complex traits requires a mechanistic understanding of expression regulation across different conditions.
+It has been shown that this insight is critical for developing new therapies.
+Transcriptome-wide association studies have helped uncover the role of individual genes in disease-relevant mechanisms.
+However, modern models of the architecture of complex traits predict that gene-gene interactions play a crucial role in disease origin and progression.
+Here we introduce PhenoPLIER, a computational approach that maps gene-trait associations and pharmacological perturbation data into a common latent representation for a joint analysis.
+This representation is based on modules of genes with similar expression patterns across the same conditions.
+We observe that diseases are significantly associated with gene modules expressed in relevant cell types, and our approach is accurate in predicting known drug-disease pairs and inferring mechanisms of action.
+Furthermore, using a CRISPR screen to analyze lipid regulation, we find that functionally important players lack associations but are prioritized in trait-associated modules by PhenoPLIER.
+By incorporating groups of co-expressed genes, PhenoPLIER can contextualize genetic associations and reveal potential targets missed by single-gene strategies.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/02.introduction.md b/tests/manuscripts/phenoplier_full_only_first_para/02.introduction.md
new file mode 100644
index 0000000..f0cb55f
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/02.introduction.md
@@ -0,0 +1,7 @@
+## Introduction
+
+Genes work together in context-specific networks to carry out different functions [@pmid:19104045; @doi:10.1038/ng.3259].
+Variations in these genes can change their functional role and, at a higher level, affect disease-relevant biological processes [@doi:10.1038/s41467-018-06022-6].
+In this context, determining how genes influence complex traits requires mechanistically understanding expression regulation across different cell types [@doi:10.1126/science.aaz1776; @doi:10.1038/s41586-020-2559-3; @doi:10.1038/s41576-019-0200-9], which in turn should lead to improved treatments [@doi:10.1038/ng.3314; @doi:10.1371/journal.pgen.1008489].
+Previous studies have described different regulatory DNA elements [@doi:10.1038/nature11247; @doi:10.1038/nature14248; @doi:10.1038/nature12787; @doi:10.1038/s41586-020-03145-z; @doi:10.1038/s41586-020-2559-3] including genetic effects on gene expression across different tissues [@doi:10.1126/science.aaz1776].
+Integrating functional genomics data and GWAS data [@doi:10.1038/s41588-018-0081-4; @doi:10.1016/j.ajhg.2018.04.002; @doi:10.1038/s41588-018-0081-4; @doi:10.1038/ncomms6890] has improved the identification of these transcriptional mechanisms that, when dysregulated, commonly result in tissue- and cell lineage-specific pathology [@pmid:20624743; @pmid:14707169; @doi:10.1073/pnas.0810772105].
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/04.00.results.md b/tests/manuscripts/phenoplier_full_only_first_para/04.00.results.md
new file mode 100644
index 0000000..60da89d
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/04.00.results.md
@@ -0,0 +1,10 @@
+## Results
+
+<!--
+
+Some papers that might be interesting:
+
+https://www.nature.com/articles/s41591-020-01221-5
+Air pollution linked to neurodegeneration markers
+
+-->
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/04.05.00.results_framework.md b/tests/manuscripts/phenoplier_full_only_first_para/04.05.00.results_framework.md
new file mode 100644
index 0000000..ebc8c13
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/04.05.00.results_framework.md
@@ -0,0 +1,9 @@
+### PhenoPLIER: an integration framework based on gene co-expression patterns
+
+PhenoPLIER is a flexible computational framework that combines gene-trait and gene-drug associations with gene modules expressed in specific contexts (Figure {@fig:entire_process}a).
+The approach uses a latent representation (with latent variables or LVs representing gene modules) derived from a large gene expression compendium (Figure {@fig:entire_process}b, top) to integrate TWAS with drug-induced transcriptional responses (Figure {@fig:entire_process}b, middle) for a joint analysis.
+The approach consists in three main components (Figure {@fig:entire_process}b, bottom, see [Methods](#sec:methods)):
+1) an LV-based regression model to compute an association between an LV and a trait,
+2) a clustering framework to learn groups of traits with shared transcriptomic properties,
+and 3) an LV-based drug repurposing approach that links diseases to potential treatments.
+We performed extensive simulations for our regression model ([Supplementary Note 1](#sm:reg:null_sim)) and clustering framework ([Supplementary Note 2](#sm:clustering:null_sim)) to ensure proper calibration and expected results under a model of no association.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/04.05.01.crispr.md b/tests/manuscripts/phenoplier_full_only_first_para/04.05.01.crispr.md
new file mode 100644
index 0000000..d51e698
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/04.05.01.crispr.md
@@ -0,0 +1,8 @@
+### LVs link genes that alter lipid accumulation with relevant traits and tissues
+
+Our first experiment attempted to answer whether genes in a disease-relevant LV could represent potential therapeutic targets.
+For this, the first step was to obtain a set of genes strongly associated with a phenotype of interest.
+Therefore, we performed a fluorescence-based CRISPR-Cas9 in the HepG2 cell line and identified 462 genes associated with lipid regulation ([Methods](#sec:methods:crispr)).
+From these, we selected two high-confidence gene sets that either caused a decrease or increase of lipids:
+a lipids-decreasing gene-set with eight genes: *BLCAP*, *FBXW7*, *INSIG2*, *PCYT2*, *PTEN*, *SOX9*, *TCF7L2*, *UBE2J2*;
+and a lipids-increasing gene-set with six genes: *ACACA*, *DGAT2*, *HILPDA*, *MBTPS1*, *SCAP*, *SRPR* (Supplementary Data 2).
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/04.15.drug_disease_prediction.md b/tests/manuscripts/phenoplier_full_only_first_para/04.15.drug_disease_prediction.md
new file mode 100644
index 0000000..7cbdae2
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/04.15.drug_disease_prediction.md
@@ -0,0 +1,9 @@
+### LVs predict drug-disease pairs better than single genes
+
+We next determined how substituting LVs for individual genes predicted known treatment-disease relationships.
+For this, we used the transcriptional responses to small molecule perturbations profiled in LINCS L1000 [@doi:10.1016/j.cell.2017.10.049], which were further processed and mapped to DrugBank IDs [@doi:10.1093/nar/gkt1068; @doi:10.7554/eLife.26726; @doi:10.5281/zenodo.47223].
+Based on an established drug repurposing strategy that matches reversed transcriptome patterns between genes and drug-induced perturbations [@doi:10.1126/scitranslmed.3002648; @doi:10.1126/scitranslmed.3001318], we adopted a previously described framework that uses imputed transcriptomes from TWAS to prioritize drug candidates [@doi:10.1038/nn.4618].
+For this, we computed a drug-disease score by calculating the negative dot product between the $z$-scores for a disease (from TWAS) and the $z$-scores for a drug (from LINCS) across sets of genes of different sizes (see [Methods](#sec:methods:drug)).
+Therefore, a large score for a drug-disease pair indicated that higher (lower) predicted expression values of disease-associated genes are down (up)-regulated by the drug, thus predicting a potential treatment.
+Similarly, for the LV-based approach, we estimated how pharmacological perturbations affected the gene module activity by projecting expression profiles of drugs into our latent representation (Figure {@fig:entire_process}b).
+We used a manually-curated gold standard set of drug-disease medical indications [@doi:10.7554/eLife.26726; @doi:10.5281/zenodo.47664] for 322 drugs across 53 diseases to evaluate the prediction performance.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/04.20.00.traits_clustering.md b/tests/manuscripts/phenoplier_full_only_first_para/04.20.00.traits_clustering.md
new file mode 100644
index 0000000..5999b40
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/04.20.00.traits_clustering.md
@@ -0,0 +1,11 @@
+### LVs reveal trait clusters with shared transcriptomic properties
+
+We used the projection of gene-trait associations into the latent space to find groups of clusters linked by the same transcriptional processes.
+Since individual clustering algorithms have different biases (i.e., assumptions about the data structure), we designed a consensus clustering framework that combines solutions or partitions of traits generated by different methods ([Methods](#sec:methods:clustering)).
+Consensus or ensemble approaches have been recommended to avoid several pitfalls when performing cluster analysis on biological data [@doi:10.1126/scisignal.aad1932].
+Since diversity in the ensemble is crucial for these methods, we generated different data versions which were processed using different methods with varying sets of parameters (Figure {@fig:clustering:design}a).
+Then, a consensus function combines the ensemble into a consolidated solution, which has been shown to outperform any individual member of the ensemble [@Strehl2002; @doi:10.1109/TPAMI.2005.113].
+Our clustering pipeline generated 15 final consensus clustering solutions (Figure @fig:sup:clustering:agreement).
+The number of clusters of these partitions (between 5 and 29) was learned from the data by selecting the partitions with the largest agreement with the ensemble [@Strehl2002].
+Instead of selecting one of these final solutions with a specific number of clusters, we used a clustering tree [@doi:10.1093/gigascience/giy083] (Figure @fig:clustering:tree) to examine stable groups of traits across multiple resolutions.
+To understand which latent variables differentiated the group of traits, we trained a decision tree classifier on the input data $\hat{\mathbf{M}}$ using the clusters found as labels (Figure {@fig:clustering:design}b, see [Methods](#sec:methods:clustering)).
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/05.discussion.md b/tests/manuscripts/phenoplier_full_only_first_para/05.discussion.md
new file mode 100644
index 0000000..e300d00
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/05.discussion.md
@@ -0,0 +1,8 @@
+## Discussion
+
+We have introduced a novel computational strategy that integrates statistical associations from TWAS with groups of genes (gene modules) that have similar expression patterns across the same cell types.
+Our key innovation is that we project gene-trait associations through a latent representation derived not strictly from measures of normal tissue but also from cell types under a variety of stimuli and at various developmental stages.
+This improves interpretation by going beyond statistical associations to infer cell type-specific features of complex phenotypes.
+Our approach can identify disease-relevant cell types from summary statistics, and several disease-associated gene modules were replicated in eMERGE.
+Using a CRISPR screen to analyze lipid regulation, we found that our gene module-based approach can prioritize causal genes even when single gene associations are not detected.
+We interpret these findings with an omnigenic perspective of "core" and "peripheral" genes, suggesting that the approach can identify genes that directly affect the trait with no mediated regulation of other genes and thus prioritize alternative and potentially more attractive therapeutic targets.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/07.00.methods.md b/tests/manuscripts/phenoplier_full_only_first_para/07.00.methods.md
new file mode 100644
index 0000000..521262f
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/07.00.methods.md
@@ -0,0 +1,9 @@
+## Methods {#sec:methods}
+
+PhenoPLIER is a framework that combines different computational approaches to integrate gene-trait associations and drug-induced transcriptional responses with groups of functionally-related genes (referred to as gene modules or latent variables/LVs).
+Gene-trait associations are computed using the PrediXcan family of methods, whereas latent variables are inferred by the MultiPLIER models applied to large gene expression compendia.
+PhenoPLIER provides
+1) a regression model to compute an LV-trait association,
+2) a consensus clustering approach applied to the latent space to learn shared and distinct transcriptomic properties between traits, and
+3) an interpretable, LV-based drug repurposing framework.
+We provide the details of these methods below.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/10.references.md b/tests/manuscripts/phenoplier_full_only_first_para/10.references.md
new file mode 100644
index 0000000..339b33c
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/10.references.md
@@ -0,0 +1,4 @@
+## References {.page_break_before}
+
+<!-- Explicitly insert bibliography here -->
+<div id="refs"></div>
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/15.acknowledgements.md b/tests/manuscripts/phenoplier_full_only_first_para/15.acknowledgements.md
new file mode 100644
index 0000000..58d6e0a
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/15.acknowledgements.md
@@ -0,0 +1,13 @@
+## Acknowledgements
+
+This study was funded by:
+the Gordon and Betty Moore Foundation (GBMF 4552 to C.S. Greene; GBMF 4560 to B.D. Sullivan),
+the National Human Genome Research Institute (R01 HG010067 to C.S. Greene, S.F.A. Grant and B.D. Sullivan; K99 HG011898 and R00 HG011898 to M. Pividori; U01 HG011181 to W. Wei),
+the National Cancer Institute (R01 CA237170 to C.S. Greene),
+the Eunice Kennedy Shriver National Institute of Child Health and Human Development (R01 HD109765 to C.S. Greene),
+the National Institute on Aging (R01 AG069900 to W. Wei),
+the National Institute of General Medical Sciences (R01 GM139891 to W. Wei);
+the National Heart, Lung, and Blood Institute (R01 HL163854 to Q. Feng);
+the National Institute of Diabetes and Digestive and Kidney Diseases (DK126194 to B.F. Voight);
+the Daniel B. Burke Endowed Chair for Diabetes Research to S.F.A. Grant;
+the Robert L. McNeil Jr. Endowed Fellowship in Translational Medicine and Therapeutics to C. Skarke.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/50.00.supplementary_material.md b/tests/manuscripts/phenoplier_full_only_first_para/50.00.supplementary_material.md
new file mode 100644
index 0000000..98397f2
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/50.00.supplementary_material.md
@@ -0,0 +1,13 @@
+\clearpage
+
+## Supplementary information {.page_break_before}
+
+### Supplementary Note 1: mean type I error rates and calibration of LV-based regression model {#sm:reg:null_sim}
+
+We assessed our GLS model type I error rates (proportion of $p$-values below 0.05) and calibration using a null model of random traits and genotype data from 1000 Genomes Phase III.
+We selected 312 individuals with European ancestry, and then analyzed 1,000 traits drawn from a standard normal distribution $\mathcal{N}(0,1)$.
+We ran all the standard procedures for the TWAS approaches (S-PrediXcan and S-MultiXcan), including:
+1) a standard GWAS using linear regression under an additive genetic model,
+2) different GWAS processing steps, including harmonization and imputation procedures as defined in [@doi:10.1002/gepi.22346],
+3) S-PrediXcan and S-MultiXcan analyses.
+Below we provide details for each of these steps.
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/manual-references.json b/tests/manuscripts/phenoplier_full_only_first_para/manual-references.json
new file mode 100644
index 0000000..8ae1023
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/manual-references.json
@@ -0,0 +1,9296 @@
+[
+  {
+    "type": "article-journal",
+    "id": "MxGpAiPu",
+    "container-title": "Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms",
+    "author": [
+      {
+        "family": "Arthur",
+        "given": "David"
+      },
+      {
+        "family": "Vassilvitskii",
+        "given": "Sergei"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2007
+        ]
+      ]
+    },
+    "page": "1027-1035",
+    "title": "k-means++: the advantages of careful seeding",
+    "URL": "http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Arthur2007"
+  },
+  {
+    "type": "article-journal",
+    "id": "JrL3iQea",
+    "container-title": "Proceedings of the Second International Conference on Knowledge Discovery and Data Mining",
+    "author": [
+      {
+        "family": "Ester",
+        "given": "Martin"
+      },
+      {
+        "family": "Kriegel",
+        "given": "Hans-Peter"
+      },
+      {
+        "family": "Sander",
+        "given": "Jörg"
+      },
+      {
+        "family": "Xu",
+        "given": "Xiaowei"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          1996
+        ]
+      ]
+    },
+    "page": "226-231",
+    "title": "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise",
+    "URL": "https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Ester1996"
+  },
+  {
+    "type": "article-journal",
+    "id": "x3CT24TB",
+    "container-title": "Advances in Neural Information Processing Systems",
+    "author": [
+      {
+        "family": "Ng",
+        "given": "Andrew"
+      },
+      {
+        "family": "Jordan",
+        "given": "Michael"
+      },
+      {
+        "family": "Weiss",
+        "given": "Yair"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2001
+        ]
+      ]
+    },
+    "page": "849-856",
+    "title": "On Spectral Clustering: Analysis and an algorithm",
+    "URL": "https://ai.stanford.edu/~ang/papers/nips01-spectral.pdf",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Ng2001"
+  },
+  {
+    "type": "article-journal",
+    "id": "uw3AnEgA",
+    "container-title": "Journal of Machine Learning Research",
+    "author": [
+      {
+        "family": "Strehl",
+        "given": "Alexander"
+      },
+      {
+        "family": "Ghosh",
+        "given": "Joydeep"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2002
+        ]
+      ]
+    },
+    "volume": "3",
+    "page": "583-617",
+    "title": "Cluster Ensembles – A Knowledge Reuse Framework for Combining Multiple Partitions",
+    "URL": "https://www.jmlr.org/papers/v3/strehl02a.html",
+    "note": "Loaded from an external bibliography file by Manubot.\nsource_bibliography: manual-references.json\nstandard_id: Strehl2002"
+  },
+  {
+    "id": "157h5hA34",
+    "URL": "https://arxiv.org/abs/1802.03426",
+    "number": "1802.03426",
+    "title": "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          21
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Leland",
+        "family": "McInnes"
+      },
+      {
+        "given": "John",
+        "family": "Healy"
+      },
+      {
+        "given": "James",
+        "family": "Melville"
+      }
+    ],
+    "container-title": "arXiv",
+    "publisher": "arXiv",
+    "type": "report",
+    "abstract": "  UMAP (Uniform Manifold Approximation and Projection) is a novel manifold learning technique for dimension reduction. UMAP is constructed from a theoretical framework based in Riemannian geometry and algebraic topology. The result is a practical scalable algorithm that applies to real world data. The UMAP algorithm is competitive with t-SNE for visualization quality, and arguably preserves more of the global structure with superior run time performance. Furthermore, UMAP has no computational restrictions on embedding dimension, making it viable as a general purpose dimension reduction technique for machine learning. ",
+    "note": "license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/\nThis CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: arxiv:1802.03426"
+  },
+  {
+    "publisher": "Wiley",
+    "issue": "8",
+    "DOI": "10.1002/gepi.22346",
+    "type": "article-journal",
+    "page": "854-867",
+    "source": "Crossref",
+    "title": "Fine‐mapping and QTL tissue‐sharing information improves the reliability of causal gene identification",
+    "volume": "44",
+    "author": [
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Owen J.",
+        "family": "Melia"
+      },
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Rodrigo",
+        "family": "Bonazzola"
+      },
+      {
+        "given": "Gao",
+        "family": "Wang"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "François",
+        "family": "Aguet"
+      },
+      {
+        "given": "Kristin G.",
+        "family": "Ardlie"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      },
+      {
+        "given": "Hae K.",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Genetic Epidemiology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqsvf7",
+    "container-title-short": "Genetic Epidemiology",
+    "PMCID": "PMC7693040",
+    "PMID": "32964524",
+    "id": "SiobXsoB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1002/gepi.22346"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1007/bf01908075",
+    "type": "article-journal",
+    "page": "193-218",
+    "source": "Crossref",
+    "title": "Comparing partitions",
+    "volume": "2",
+    "author": [
+      {
+        "given": "Lawrence",
+        "family": "Hubert"
+      },
+      {
+        "given": "Phipps",
+        "family": "Arabie"
+      }
+    ],
+    "container-title": "Journal of Classification",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          1985,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bphmzh",
+    "container-title-short": "Journal of Classification",
+    "id": "e4AuoW8N",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1007/bf01908075"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "5",
+    "DOI": "10.1016/j.ajhg.2018.04.002",
+    "type": "article-journal",
+    "page": "717-730",
+    "source": "Crossref",
+    "title": "The Post-GWAS Era: From Association to Function",
+    "volume": "102",
+    "author": [
+      {
+        "given": "Michael D.",
+        "family": "Gallagher"
+      },
+      {
+        "given": "Alice S.",
+        "family": "Chen-Plotkin"
+      }
+    ],
+    "container-title": "The American Journal of Human Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          5
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdmftd",
+    "container-title-short": "The American Journal of Human Genetics",
+    "PMCID": "PMC5986732",
+    "PMID": "29727686",
+    "id": "aIyQY5ZT",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.ajhg.2018.04.002"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "1",
+    "DOI": "10.1016/j.ajhg.2020.11.012",
+    "type": "article-journal",
+    "page": "25-35",
+    "source": "Crossref",
+    "title": "Probabilistic colocalization of genetic variants from complex and molecular traits: promise and limitations",
+    "volume": "108",
+    "author": [
+      {
+        "given": "Abhay",
+        "family": "Hukku"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Francesca",
+        "family": "Luca"
+      },
+      {
+        "given": "Roger",
+        "family": "Pique-Regi"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      }
+    ],
+    "container-title": "The American Journal of Human Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gj58gg",
+    "container-title-short": "The American Journal of Human Genetics",
+    "PMCID": "PMC7820626",
+    "PMID": "33308443",
+    "id": "ndd3tW4g",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.ajhg.2020.11.012"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "8",
+    "DOI": "10.1016/j.amjcard.2008.02.029",
+    "type": "article-journal",
+    "page": "S20-S26",
+    "source": "Crossref",
+    "title": "Mechanism of Action of Niacin",
+    "volume": "101",
+    "author": [
+      {
+        "given": "Vaijinath S.",
+        "family": "Kamanna"
+      },
+      {
+        "given": "Moti L.",
+        "family": "Kashyap"
+      }
+    ],
+    "container-title": "The American Journal of Cardiology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/c8zwdt",
+    "container-title-short": "The American Journal of Cardiology",
+    "PMID": "18375237",
+    "id": "LVihFr3g",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.amjcard.2008.02.029"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "2",
+    "DOI": "10.1016/j.cell.2011.01.004",
+    "type": "article-journal",
+    "page": "296-309",
+    "source": "Crossref",
+    "title": "Densely Interconnected Transcriptional Circuits Control Cell States in Human Hematopoiesis",
+    "volume": "144",
+    "author": [
+      {
+        "given": "Noa",
+        "family": "Novershtern"
+      },
+      {
+        "given": "Aravind",
+        "family": "Subramanian"
+      },
+      {
+        "given": "Lee N.",
+        "family": "Lawton"
+      },
+      {
+        "given": "Raymond H.",
+        "family": "Mak"
+      },
+      {
+        "given": "W. Nicholas",
+        "family": "Haining"
+      },
+      {
+        "given": "Marie E.",
+        "family": "McConkey"
+      },
+      {
+        "given": "Naomi",
+        "family": "Habib"
+      },
+      {
+        "given": "Nir",
+        "family": "Yosef"
+      },
+      {
+        "given": "Cindy Y.",
+        "family": "Chang"
+      },
+      {
+        "given": "Tal",
+        "family": "Shay"
+      },
+      {
+        "given": "Garrett M.",
+        "family": "Frampton"
+      },
+      {
+        "given": "Adam C.B.",
+        "family": "Drake"
+      },
+      {
+        "given": "Ilya",
+        "family": "Leskov"
+      },
+      {
+        "given": "Bjorn",
+        "family": "Nilsson"
+      },
+      {
+        "given": "Fred",
+        "family": "Preffer"
+      },
+      {
+        "given": "David",
+        "family": "Dombkowski"
+      },
+      {
+        "given": "John W.",
+        "family": "Evans"
+      },
+      {
+        "given": "Ted",
+        "family": "Liefeld"
+      },
+      {
+        "given": "John S.",
+        "family": "Smutko"
+      },
+      {
+        "given": "Jianzhu",
+        "family": "Chen"
+      },
+      {
+        "given": "Nir",
+        "family": "Friedman"
+      },
+      {
+        "given": "Richard A.",
+        "family": "Young"
+      },
+      {
+        "given": "Todd R.",
+        "family": "Golub"
+      },
+      {
+        "given": "Aviv",
+        "family": "Regev"
+      },
+      {
+        "given": "Benjamin L.",
+        "family": "Ebert"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cf5k92",
+    "container-title-short": "Cell",
+    "PMCID": "PMC3049864",
+    "PMID": "21241896",
+    "id": "Zk82GvJV",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2011.01.004"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "7",
+    "DOI": "10.1016/j.cell.2017.05.038",
+    "type": "article-journal",
+    "page": "1177-1186",
+    "source": "Crossref",
+    "title": "An Expanded View of Complex Traits: From Polygenic to Omnigenic",
+    "volume": "169",
+    "author": [
+      {
+        "given": "Evan A.",
+        "family": "Boyle"
+      },
+      {
+        "given": "Yang I.",
+        "family": "Li"
+      },
+      {
+        "given": "Jonathan K.",
+        "family": "Pritchard"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          6
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gcpgdz",
+    "container-title-short": "Cell",
+    "PMCID": "PMC5536862",
+    "PMID": "28622505",
+    "id": "vpIDZCSa",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2017.05.038"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "6",
+    "DOI": "10.1016/j.cell.2017.10.049",
+    "type": "article-journal",
+    "page": "1437-1452.e17",
+    "source": "Crossref",
+    "title": "A Next Generation Connectivity Map: L1000 Platform and the First 1,000,000 Profiles",
+    "volume": "171",
+    "author": [
+      {
+        "given": "Aravind",
+        "family": "Subramanian"
+      },
+      {
+        "given": "Rajiv",
+        "family": "Narayan"
+      },
+      {
+        "given": "Steven M.",
+        "family": "Corsello"
+      },
+      {
+        "given": "David D.",
+        "family": "Peck"
+      },
+      {
+        "given": "Ted E.",
+        "family": "Natoli"
+      },
+      {
+        "given": "Xiaodong",
+        "family": "Lu"
+      },
+      {
+        "given": "Joshua",
+        "family": "Gould"
+      },
+      {
+        "given": "John F.",
+        "family": "Davis"
+      },
+      {
+        "given": "Andrew A.",
+        "family": "Tubelli"
+      },
+      {
+        "given": "Jacob K.",
+        "family": "Asiedu"
+      },
+      {
+        "given": "David L.",
+        "family": "Lahr"
+      },
+      {
+        "given": "Jodi E.",
+        "family": "Hirschman"
+      },
+      {
+        "given": "Zihan",
+        "family": "Liu"
+      },
+      {
+        "given": "Melanie",
+        "family": "Donahue"
+      },
+      {
+        "given": "Bina",
+        "family": "Julian"
+      },
+      {
+        "given": "Mariya",
+        "family": "Khan"
+      },
+      {
+        "given": "David",
+        "family": "Wadden"
+      },
+      {
+        "given": "Ian C.",
+        "family": "Smith"
+      },
+      {
+        "given": "Daniel",
+        "family": "Lam"
+      },
+      {
+        "given": "Arthur",
+        "family": "Liberzon"
+      },
+      {
+        "given": "Courtney",
+        "family": "Toder"
+      },
+      {
+        "given": "Mukta",
+        "family": "Bagul"
+      },
+      {
+        "given": "Marek",
+        "family": "Orzechowski"
+      },
+      {
+        "given": "Oana M.",
+        "family": "Enache"
+      },
+      {
+        "given": "Federica",
+        "family": "Piccioni"
+      },
+      {
+        "given": "Sarah A.",
+        "family": "Johnson"
+      },
+      {
+        "given": "Nicholas J.",
+        "family": "Lyons"
+      },
+      {
+        "given": "Alice H.",
+        "family": "Berger"
+      },
+      {
+        "given": "Alykhan F.",
+        "family": "Shamji"
+      },
+      {
+        "given": "Angela N.",
+        "family": "Brooks"
+      },
+      {
+        "given": "Anita",
+        "family": "Vrcic"
+      },
+      {
+        "given": "Corey",
+        "family": "Flynn"
+      },
+      {
+        "given": "Jacqueline",
+        "family": "Rosains"
+      },
+      {
+        "given": "David Y.",
+        "family": "Takeda"
+      },
+      {
+        "given": "Roger",
+        "family": "Hu"
+      },
+      {
+        "given": "Desiree",
+        "family": "Davison"
+      },
+      {
+        "given": "Justin",
+        "family": "Lamb"
+      },
+      {
+        "given": "Kristin",
+        "family": "Ardlie"
+      },
+      {
+        "given": "Larson",
+        "family": "Hogstrom"
+      },
+      {
+        "given": "Peyton",
+        "family": "Greenside"
+      },
+      {
+        "given": "Nathanael S.",
+        "family": "Gray"
+      },
+      {
+        "given": "Paul A.",
+        "family": "Clemons"
+      },
+      {
+        "given": "Serena",
+        "family": "Silver"
+      },
+      {
+        "given": "Xiaoyun",
+        "family": "Wu"
+      },
+      {
+        "given": "Wen-Ning",
+        "family": "Zhao"
+      },
+      {
+        "given": "Willis",
+        "family": "Read-Button"
+      },
+      {
+        "given": "Xiaohua",
+        "family": "Wu"
+      },
+      {
+        "given": "Stephen J.",
+        "family": "Haggarty"
+      },
+      {
+        "given": "Lucienne V.",
+        "family": "Ronco"
+      },
+      {
+        "given": "Jesse S.",
+        "family": "Boehm"
+      },
+      {
+        "given": "Stuart L.",
+        "family": "Schreiber"
+      },
+      {
+        "given": "John G.",
+        "family": "Doench"
+      },
+      {
+        "given": "Joshua A.",
+        "family": "Bittker"
+      },
+      {
+        "given": "David E.",
+        "family": "Root"
+      },
+      {
+        "given": "Bang",
+        "family": "Wong"
+      },
+      {
+        "given": "Todd R.",
+        "family": "Golub"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cgwt",
+    "container-title-short": "Cell",
+    "PMCID": "PMC5990023",
+    "PMID": "29195078",
+    "id": "F7lIlh2N",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2017.10.049"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "4",
+    "DOI": "10.1016/j.cell.2019.04.014",
+    "type": "article-journal",
+    "page": "1022-1034.e6",
+    "source": "Crossref",
+    "title": "Trans Effects on Gene Expression Can Drive Omnigenic Inheritance",
+    "volume": "177",
+    "author": [
+      {
+        "given": "Xuanyao",
+        "family": "Liu"
+      },
+      {
+        "given": "Yang I.",
+        "family": "Li"
+      },
+      {
+        "given": "Jonathan K.",
+        "family": "Pritchard"
+      }
+    ],
+    "container-title": "Cell",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          5
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfz8bj",
+    "container-title-short": "Cell",
+    "PMCID": "PMC6553491",
+    "PMID": "31051098",
+    "id": "LXvTZzEA",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cell.2019.04.014"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "6",
+    "DOI": "10.1016/j.cels.2015.12.004",
+    "type": "article-journal",
+    "page": "417-425",
+    "source": "Crossref",
+    "title": "The Molecular Signatures Database Hallmark Gene Set Collection",
+    "volume": "1",
+    "author": [
+      {
+        "given": "Arthur",
+        "family": "Liberzon"
+      },
+      {
+        "given": "Chet",
+        "family": "Birger"
+      },
+      {
+        "given": "Helga",
+        "family": "Thorvaldsdóttir"
+      },
+      {
+        "given": "Mahmoud",
+        "family": "Ghandi"
+      },
+      {
+        "given": "Jill P.",
+        "family": "Mesirov"
+      },
+      {
+        "given": "Pablo",
+        "family": "Tamayo"
+      }
+    ],
+    "container-title": "Cell Systems",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf78hq",
+    "container-title-short": "Cell Systems",
+    "PMCID": "PMC4707969",
+    "PMID": "26771021",
+    "id": "1CbVoEpNJ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cels.2015.12.004"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "5",
+    "DOI": "10.1016/j.cels.2019.04.003",
+    "type": "article-journal",
+    "page": "380-394.e4",
+    "source": "Crossref",
+    "title": "MultiPLIER: A Transfer Learning Framework for Transcriptomics Reveals Systemic Features of Rare Disease",
+    "volume": "8",
+    "author": [
+      {
+        "given": "Jaclyn N.",
+        "family": "Taroni"
+      },
+      {
+        "given": "Peter C.",
+        "family": "Grayson"
+      },
+      {
+        "given": "Qiwen",
+        "family": "Hu"
+      },
+      {
+        "given": "Sean",
+        "family": "Eddy"
+      },
+      {
+        "given": "Matthias",
+        "family": "Kretzler"
+      },
+      {
+        "given": "Peter A.",
+        "family": "Merkel"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      }
+    ],
+    "container-title": "Cell Systems",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          5
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf75g5",
+    "container-title-short": "Cell Systems",
+    "PMCID": "PMC6538307",
+    "PMID": "31121115",
+    "id": "14rnBunuZ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.cels.2019.04.003"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "DOI": "10.1016/j.ins.2016.04.027",
+    "type": "article-journal",
+    "page": "120-134",
+    "source": "Crossref",
+    "title": "Diversity control for improving the analysis of consensus clustering",
+    "volume": "361-362",
+    "author": [
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Georgina",
+        "family": "Stegmayer"
+      },
+      {
+        "given": "Diego H.",
+        "family": "Milone"
+      }
+    ],
+    "container-title": "Information Sciences",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          9
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghtqbk",
+    "container-title-short": "Information Sciences",
+    "id": "8js8Q3pF",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.ins.2016.04.027"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "10",
+    "DOI": "10.1016/j.patcog.2014.04.005",
+    "type": "article-journal",
+    "page": "3362-3375",
+    "source": "Crossref",
+    "title": "Hybrid clustering solution selection strategy",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Zhiwen",
+        "family": "Yu"
+      },
+      {
+        "given": "Le",
+        "family": "Li"
+      },
+      {
+        "given": "Yunjun",
+        "family": "Gao"
+      },
+      {
+        "given": "Jane",
+        "family": "You"
+      },
+      {
+        "given": "Jiming",
+        "family": "Liu"
+      },
+      {
+        "given": "Hau-San",
+        "family": "Wong"
+      },
+      {
+        "given": "Guoqiang",
+        "family": "Han"
+      }
+    ],
+    "container-title": "Pattern Recognition",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghtzwt",
+    "container-title-short": "Pattern Recognition",
+    "id": "t5p3UpxZ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.patcog.2014.04.005"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "7",
+    "DOI": "10.1016/j.tips.2006.05.008",
+    "type": "article-journal",
+    "page": "384-390",
+    "source": "Crossref",
+    "title": "The nicotinic acid receptor GPR109A (HM74A or PUMA-G) as a new therapeutic target",
+    "volume": "27",
+    "author": [
+      {
+        "given": "S",
+        "family": "OFFERMANNS"
+      }
+    ],
+    "container-title": "Trends in Pharmacological Sciences",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2006,
+          7
+        ]
+      ]
+    },
+    "URL": "https://doi.org/fgb4tr",
+    "container-title-short": "Trends in Pharmacological Sciences",
+    "PMID": "16766048",
+    "id": "izbPw2kc",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1016/j.tips.2006.05.008"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "10",
+    "DOI": "10.1038/gim.2013.72",
+    "type": "article-journal",
+    "page": "761-771",
+    "source": "Crossref",
+    "title": "The Electronic Medical Records and Genomics (eMERGE) Network: past, present, and future",
+    "volume": "15",
+    "author": [
+      {
+        "given": "Omri",
+        "family": "Gottesman"
+      },
+      {
+        "given": "Helena",
+        "family": "Kuivaniemi"
+      },
+      {
+        "given": "Gerard",
+        "family": "Tromp"
+      },
+      {
+        "given": "W. Andrew",
+        "family": "Faucett"
+      },
+      {
+        "given": "Rongling",
+        "family": "Li"
+      },
+      {
+        "given": "Teri A.",
+        "family": "Manolio"
+      },
+      {
+        "given": "Saskia C.",
+        "family": "Sanderson"
+      },
+      {
+        "given": "Joseph",
+        "family": "Kannry"
+      },
+      {
+        "given": "Randi",
+        "family": "Zinberg"
+      },
+      {
+        "given": "Melissa A.",
+        "family": "Basford"
+      },
+      {
+        "given": "Murray",
+        "family": "Brilliant"
+      },
+      {
+        "given": "David J.",
+        "family": "Carey"
+      },
+      {
+        "given": "Rex L.",
+        "family": "Chisholm"
+      },
+      {
+        "given": "Christopher G.",
+        "family": "Chute"
+      },
+      {
+        "given": "John J.",
+        "family": "Connolly"
+      },
+      {
+        "given": "David",
+        "family": "Crosslin"
+      },
+      {
+        "given": "Joshua C.",
+        "family": "Denny"
+      },
+      {
+        "given": "Carlos J.",
+        "family": "Gallego"
+      },
+      {
+        "given": "Jonathan L.",
+        "family": "Haines"
+      },
+      {
+        "given": "Hakon",
+        "family": "Hakonarson"
+      },
+      {
+        "given": "John",
+        "family": "Harley"
+      },
+      {
+        "given": "Gail P.",
+        "family": "Jarvik"
+      },
+      {
+        "given": "Isaac",
+        "family": "Kohane"
+      },
+      {
+        "given": "Iftikhar J.",
+        "family": "Kullo"
+      },
+      {
+        "given": "Eric B.",
+        "family": "Larson"
+      },
+      {
+        "given": "Catherine",
+        "family": "McCarty"
+      },
+      {
+        "given": "Marylyn D.",
+        "family": "Ritchie"
+      },
+      {
+        "given": "Dan M.",
+        "family": "Roden"
+      },
+      {
+        "given": "Maureen E.",
+        "family": "Smith"
+      },
+      {
+        "given": "Erwin P.",
+        "family": "Böttinger"
+      },
+      {
+        "given": "Marc S.",
+        "family": "Williams"
+      }
+    ],
+    "container-title": "Genetics in Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f5dwbt",
+    "container-title-short": "Genetics in Medicine",
+    "PMCID": "PMC3795928",
+    "PMID": "23743551",
+    "id": "wfqjCerX",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/gim.2013.72"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7414",
+    "DOI": "10.1038/nature11247",
+    "type": "article-journal",
+    "page": "57-74",
+    "source": "Crossref",
+    "title": "An integrated encyclopedia of DNA elements in the human genome",
+    "volume": "489",
+    "author": [
+      {}
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2012,
+          9
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bg9d",
+    "container-title-short": "Nature",
+    "PMCID": "PMC3439153",
+    "PMID": "22955616",
+    "id": "15J98V2qM",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nature11247"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7493",
+    "DOI": "10.1038/nature12787",
+    "type": "article-journal",
+    "page": "455-461",
+    "source": "Crossref",
+    "title": "An atlas of active enhancers across human cell types and tissues",
+    "volume": "507",
+    "author": [
+      {
+        "given": "Robin",
+        "family": "Andersson"
+      },
+      {},
+      {
+        "given": "Claudia",
+        "family": "Gebhard"
+      },
+      {
+        "given": "Irene",
+        "family": "Miguel-Escalada"
+      },
+      {
+        "given": "Ilka",
+        "family": "Hoof"
+      },
+      {
+        "given": "Jette",
+        "family": "Bornholdt"
+      },
+      {
+        "given": "Mette",
+        "family": "Boyd"
+      },
+      {
+        "given": "Yun",
+        "family": "Chen"
+      },
+      {
+        "given": "Xiaobei",
+        "family": "Zhao"
+      },
+      {
+        "given": "Christian",
+        "family": "Schmidl"
+      },
+      {
+        "given": "Takahiro",
+        "family": "Suzuki"
+      },
+      {
+        "given": "Evgenia",
+        "family": "Ntini"
+      },
+      {
+        "given": "Erik",
+        "family": "Arner"
+      },
+      {
+        "given": "Eivind",
+        "family": "Valen"
+      },
+      {
+        "given": "Kang",
+        "family": "Li"
+      },
+      {
+        "given": "Lucia",
+        "family": "Schwarzfischer"
+      },
+      {
+        "given": "Dagmar",
+        "family": "Glatz"
+      },
+      {
+        "given": "Johanna",
+        "family": "Raithel"
+      },
+      {
+        "given": "Berit",
+        "family": "Lilje"
+      },
+      {
+        "given": "Nicolas",
+        "family": "Rapin"
+      },
+      {
+        "given": "Frederik Otzen",
+        "family": "Bagger"
+      },
+      {
+        "given": "Mette",
+        "family": "Jørgensen"
+      },
+      {
+        "given": "Peter Refsing",
+        "family": "Andersen"
+      },
+      {
+        "given": "Nicolas",
+        "family": "Bertin"
+      },
+      {
+        "given": "Owen",
+        "family": "Rackham"
+      },
+      {
+        "given": "A. Maxwell",
+        "family": "Burroughs"
+      },
+      {
+        "given": "J. Kenneth",
+        "family": "Baillie"
+      },
+      {
+        "given": "Yuri",
+        "family": "Ishizu"
+      },
+      {
+        "given": "Yuri",
+        "family": "Shimizu"
+      },
+      {
+        "given": "Erina",
+        "family": "Furuhata"
+      },
+      {
+        "given": "Shiori",
+        "family": "Maeda"
+      },
+      {
+        "given": "Yutaka",
+        "family": "Negishi"
+      },
+      {
+        "given": "Christopher J.",
+        "family": "Mungall"
+      },
+      {
+        "given": "Terrence F.",
+        "family": "Meehan"
+      },
+      {
+        "given": "Timo",
+        "family": "Lassmann"
+      },
+      {
+        "given": "Masayoshi",
+        "family": "Itoh"
+      },
+      {
+        "given": "Hideya",
+        "family": "Kawaji"
+      },
+      {
+        "given": "Naoto",
+        "family": "Kondo"
+      },
+      {
+        "given": "Jun",
+        "family": "Kawai"
+      },
+      {
+        "given": "Andreas",
+        "family": "Lennartsson"
+      },
+      {
+        "given": "Carsten O.",
+        "family": "Daub"
+      },
+      {
+        "given": "Peter",
+        "family": "Heutink"
+      },
+      {
+        "given": "David A.",
+        "family": "Hume"
+      },
+      {
+        "given": "Torben Heick",
+        "family": "Jensen"
+      },
+      {
+        "given": "Harukazu",
+        "family": "Suzuki"
+      },
+      {
+        "given": "Yoshihide",
+        "family": "Hayashizaki"
+      },
+      {
+        "given": "Ferenc",
+        "family": "Müller"
+      },
+      {
+        "given": "Alistair R. R.",
+        "family": "Forrest"
+      },
+      {
+        "given": "Piero",
+        "family": "Carninci"
+      },
+      {
+        "given": "Michael",
+        "family": "Rehli"
+      },
+      {
+        "given": "Albin",
+        "family": "Sandelin"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/r35",
+    "container-title-short": "Nature",
+    "PMCID": "PMC5215096",
+    "PMID": "24670763",
+    "id": "SxuuTQTQ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nature12787"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7539",
+    "DOI": "10.1038/nature14248",
+    "type": "article-journal",
+    "page": "317-330",
+    "source": "Crossref",
+    "title": "Integrative analysis of 111 reference human epigenomes",
+    "volume": "518",
+    "author": [
+      {
+        "given": "Anshul",
+        "family": "Kundaje"
+      },
+      {},
+      {
+        "given": "Wouter",
+        "family": "Meuleman"
+      },
+      {
+        "given": "Jason",
+        "family": "Ernst"
+      },
+      {
+        "given": "Misha",
+        "family": "Bilenky"
+      },
+      {
+        "given": "Angela",
+        "family": "Yen"
+      },
+      {
+        "given": "Alireza",
+        "family": "Heravi-Moussavi"
+      },
+      {
+        "given": "Pouya",
+        "family": "Kheradpour"
+      },
+      {
+        "given": "Zhizhuo",
+        "family": "Zhang"
+      },
+      {
+        "given": "Jianrong",
+        "family": "Wang"
+      },
+      {
+        "given": "Michael J.",
+        "family": "Ziller"
+      },
+      {
+        "given": "Viren",
+        "family": "Amin"
+      },
+      {
+        "given": "John W.",
+        "family": "Whitaker"
+      },
+      {
+        "given": "Matthew D.",
+        "family": "Schultz"
+      },
+      {
+        "given": "Lucas D.",
+        "family": "Ward"
+      },
+      {
+        "given": "Abhishek",
+        "family": "Sarkar"
+      },
+      {
+        "given": "Gerald",
+        "family": "Quon"
+      },
+      {
+        "given": "Richard S.",
+        "family": "Sandstrom"
+      },
+      {
+        "given": "Matthew L.",
+        "family": "Eaton"
+      },
+      {
+        "given": "Yi-Chieh",
+        "family": "Wu"
+      },
+      {
+        "given": "Andreas R.",
+        "family": "Pfenning"
+      },
+      {
+        "given": "Xinchen",
+        "family": "Wang"
+      },
+      {
+        "given": "Melina",
+        "family": "Claussnitzer"
+      },
+      {
+        "given": "Yaping",
+        "family": "Liu"
+      },
+      {
+        "given": "Cristian",
+        "family": "Coarfa"
+      },
+      {
+        "given": "R. Alan",
+        "family": "Harris"
+      },
+      {
+        "given": "Noam",
+        "family": "Shoresh"
+      },
+      {
+        "given": "Charles B.",
+        "family": "Epstein"
+      },
+      {
+        "given": "Elizabeta",
+        "family": "Gjoneska"
+      },
+      {
+        "given": "Danny",
+        "family": "Leung"
+      },
+      {
+        "given": "Wei",
+        "family": "Xie"
+      },
+      {
+        "given": "R. David",
+        "family": "Hawkins"
+      },
+      {
+        "given": "Ryan",
+        "family": "Lister"
+      },
+      {
+        "given": "Chibo",
+        "family": "Hong"
+      },
+      {
+        "given": "Philippe",
+        "family": "Gascard"
+      },
+      {
+        "given": "Andrew J.",
+        "family": "Mungall"
+      },
+      {
+        "given": "Richard",
+        "family": "Moore"
+      },
+      {
+        "given": "Eric",
+        "family": "Chuah"
+      },
+      {
+        "given": "Angela",
+        "family": "Tam"
+      },
+      {
+        "given": "Theresa K.",
+        "family": "Canfield"
+      },
+      {
+        "given": "R. Scott",
+        "family": "Hansen"
+      },
+      {
+        "given": "Rajinder",
+        "family": "Kaul"
+      },
+      {
+        "given": "Peter J.",
+        "family": "Sabo"
+      },
+      {
+        "given": "Mukul S.",
+        "family": "Bansal"
+      },
+      {
+        "given": "Annaick",
+        "family": "Carles"
+      },
+      {
+        "given": "Jesse R.",
+        "family": "Dixon"
+      },
+      {
+        "given": "Kai-How",
+        "family": "Farh"
+      },
+      {
+        "given": "Soheil",
+        "family": "Feizi"
+      },
+      {
+        "given": "Rosa",
+        "family": "Karlic"
+      },
+      {
+        "given": "Ah-Ram",
+        "family": "Kim"
+      },
+      {
+        "given": "Ashwinikumar",
+        "family": "Kulkarni"
+      },
+      {
+        "given": "Daofeng",
+        "family": "Li"
+      },
+      {
+        "given": "Rebecca",
+        "family": "Lowdon"
+      },
+      {
+        "given": "GiNell",
+        "family": "Elliott"
+      },
+      {
+        "given": "Tim R.",
+        "family": "Mercer"
+      },
+      {
+        "given": "Shane J.",
+        "family": "Neph"
+      },
+      {
+        "given": "Vitor",
+        "family": "Onuchic"
+      },
+      {
+        "given": "Paz",
+        "family": "Polak"
+      },
+      {
+        "given": "Nisha",
+        "family": "Rajagopal"
+      },
+      {
+        "given": "Pradipta",
+        "family": "Ray"
+      },
+      {
+        "given": "Richard C.",
+        "family": "Sallari"
+      },
+      {
+        "given": "Kyle T.",
+        "family": "Siebenthall"
+      },
+      {
+        "given": "Nicholas A.",
+        "family": "Sinnott-Armstrong"
+      },
+      {
+        "given": "Michael",
+        "family": "Stevens"
+      },
+      {
+        "given": "Robert E.",
+        "family": "Thurman"
+      },
+      {
+        "given": "Jie",
+        "family": "Wu"
+      },
+      {
+        "given": "Bo",
+        "family": "Zhang"
+      },
+      {
+        "given": "Xin",
+        "family": "Zhou"
+      },
+      {
+        "given": "Arthur E.",
+        "family": "Beaudet"
+      },
+      {
+        "given": "Laurie A.",
+        "family": "Boyer"
+      },
+      {
+        "given": "Philip L.",
+        "family": "De Jager"
+      },
+      {
+        "given": "Peggy J.",
+        "family": "Farnham"
+      },
+      {
+        "given": "Susan J.",
+        "family": "Fisher"
+      },
+      {
+        "given": "David",
+        "family": "Haussler"
+      },
+      {
+        "given": "Steven J. M.",
+        "family": "Jones"
+      },
+      {
+        "given": "Wei",
+        "family": "Li"
+      },
+      {
+        "given": "Marco A.",
+        "family": "Marra"
+      },
+      {
+        "given": "Michael T.",
+        "family": "McManus"
+      },
+      {
+        "given": "Shamil",
+        "family": "Sunyaev"
+      },
+      {
+        "given": "James A.",
+        "family": "Thomson"
+      },
+      {
+        "given": "Thea D.",
+        "family": "Tlsty"
+      },
+      {
+        "given": "Li-Huei",
+        "family": "Tsai"
+      },
+      {
+        "given": "Wei",
+        "family": "Wang"
+      },
+      {
+        "given": "Robert A.",
+        "family": "Waterland"
+      },
+      {
+        "given": "Michael Q.",
+        "family": "Zhang"
+      },
+      {
+        "given": "Lisa H.",
+        "family": "Chadwick"
+      },
+      {
+        "given": "Bradley E.",
+        "family": "Bernstein"
+      },
+      {
+        "given": "Joseph F.",
+        "family": "Costello"
+      },
+      {
+        "given": "Joseph R.",
+        "family": "Ecker"
+      },
+      {
+        "given": "Martin",
+        "family": "Hirst"
+      },
+      {
+        "given": "Alexander",
+        "family": "Meissner"
+      },
+      {
+        "given": "Aleksandar",
+        "family": "Milosavljevic"
+      },
+      {
+        "given": "Bing",
+        "family": "Ren"
+      },
+      {
+        "given": "John A.",
+        "family": "Stamatoyannopoulos"
+      },
+      {
+        "given": "Ting",
+        "family": "Wang"
+      },
+      {
+        "given": "Manolis",
+        "family": "Kellis"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          2,
+          18
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f62jpn",
+    "container-title-short": "Nature",
+    "PMCID": "PMC4530010",
+    "PMID": "25693563",
+    "id": "sLkFMFZj",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nature14248"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/nbt.3838",
+    "type": "article-journal",
+    "page": "319-321",
+    "source": "Crossref",
+    "title": "Reproducible RNA-seq analysis using recount2",
+    "volume": "35",
+    "author": [
+      {
+        "given": "Leonardo",
+        "family": "Collado-Torres"
+      },
+      {
+        "given": "Abhinav",
+        "family": "Nellore"
+      },
+      {
+        "given": "Kai",
+        "family": "Kammers"
+      },
+      {
+        "given": "Shannon E",
+        "family": "Ellis"
+      },
+      {
+        "given": "Margaret A",
+        "family": "Taub"
+      },
+      {
+        "given": "Kasper D",
+        "family": "Hansen"
+      },
+      {
+        "given": "Andrew E",
+        "family": "Jaffe"
+      },
+      {
+        "given": "Ben",
+        "family": "Langmead"
+      },
+      {
+        "given": "Jeffrey T",
+        "family": "Leek"
+      }
+    ],
+    "container-title": "Nature Biotechnology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf75hp",
+    "container-title-short": "Nat Biotechnol",
+    "PMCID": "PMC6742427",
+    "PMID": "28398307",
+    "id": "6SPTvFXq",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nbt.3838"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/ncomms6890",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Biological interpretation of genome-wide association studies using predicted gene functions",
+    "volume": "6",
+    "author": [
+      {
+        "given": "Tune H.",
+        "family": "Pers"
+      },
+      {},
+      {
+        "given": "Juha M.",
+        "family": "Karjalainen"
+      },
+      {
+        "given": "Yingleong",
+        "family": "Chan"
+      },
+      {
+        "given": "Harm-Jan",
+        "family": "Westra"
+      },
+      {
+        "given": "Andrew R.",
+        "family": "Wood"
+      },
+      {
+        "given": "Jian",
+        "family": "Yang"
+      },
+      {
+        "given": "Julian C.",
+        "family": "Lui"
+      },
+      {
+        "given": "Sailaja",
+        "family": "Vedantam"
+      },
+      {
+        "given": "Stefan",
+        "family": "Gustafsson"
+      },
+      {
+        "given": "Tonu",
+        "family": "Esko"
+      },
+      {
+        "given": "Tim",
+        "family": "Frayling"
+      },
+      {
+        "given": "Elizabeth K.",
+        "family": "Speliotes"
+      },
+      {
+        "given": "Michael",
+        "family": "Boehnke"
+      },
+      {
+        "given": "Soumya",
+        "family": "Raychaudhuri"
+      },
+      {
+        "given": "Rudolf S. N.",
+        "family": "Fehrmann"
+      },
+      {
+        "given": "Joel N.",
+        "family": "Hirschhorn"
+      },
+      {
+        "given": "Lude",
+        "family": "Franke"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          1,
+          19
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3mwhd",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC4420238",
+    "PMID": "25597830",
+    "id": "z8MQTAnJ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ncomms6890"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "6",
+    "DOI": "10.1038/ng.3259",
+    "type": "article-journal",
+    "page": "569-576",
+    "source": "Crossref",
+    "title": "Understanding multicellular function and disease with human tissue-specific networks",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Casey S",
+        "family": "Greene"
+      },
+      {
+        "given": "Arjun",
+        "family": "Krishnan"
+      },
+      {
+        "given": "Aaron K",
+        "family": "Wong"
+      },
+      {
+        "given": "Emanuela",
+        "family": "Ricciotti"
+      },
+      {
+        "given": "Rene A",
+        "family": "Zelaya"
+      },
+      {
+        "given": "Daniel S",
+        "family": "Himmelstein"
+      },
+      {
+        "given": "Ran",
+        "family": "Zhang"
+      },
+      {
+        "given": "Boris M",
+        "family": "Hartmann"
+      },
+      {
+        "given": "Elena",
+        "family": "Zaslavsky"
+      },
+      {
+        "given": "Stuart C",
+        "family": "Sealfon"
+      },
+      {
+        "given": "Daniel I",
+        "family": "Chasman"
+      },
+      {
+        "given": "Garret A",
+        "family": "FitzGerald"
+      },
+      {
+        "given": "Kara",
+        "family": "Dolinski"
+      },
+      {
+        "given": "Tilo",
+        "family": "Grosser"
+      },
+      {
+        "given": "Olga G",
+        "family": "Troyanskaya"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          4,
+          27
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7dvkv",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC4828725",
+    "PMID": "25915600",
+    "id": "CVF61Un5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3259"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "8",
+    "DOI": "10.1038/ng.3314",
+    "type": "article-journal",
+    "page": "856-860",
+    "source": "Crossref",
+    "title": "The support of human genetic evidence for approved drug indications",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Matthew R",
+        "family": "Nelson"
+      },
+      {
+        "given": "Hannah",
+        "family": "Tipney"
+      },
+      {
+        "given": "Jeffery L",
+        "family": "Painter"
+      },
+      {
+        "given": "Judong",
+        "family": "Shen"
+      },
+      {
+        "given": "Paola",
+        "family": "Nicoletti"
+      },
+      {
+        "given": "Yufeng",
+        "family": "Shen"
+      },
+      {
+        "given": "Aris",
+        "family": "Floratos"
+      },
+      {
+        "given": "Pak Chung",
+        "family": "Sham"
+      },
+      {
+        "given": "Mulin Jun",
+        "family": "Li"
+      },
+      {
+        "given": "Junwen",
+        "family": "Wang"
+      },
+      {
+        "given": "Lon R",
+        "family": "Cardon"
+      },
+      {
+        "given": "John C",
+        "family": "Whittaker"
+      },
+      {
+        "given": "Philippe",
+        "family": "Sanseau"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          6,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3mn52",
+    "container-title-short": "Nat Genet",
+    "PMID": "26121088",
+    "id": "REXpV7nA",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3314"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "9",
+    "DOI": "10.1038/ng.3367",
+    "type": "article-journal",
+    "page": "1091-1098",
+    "source": "Crossref",
+    "title": "A gene-based association method for mapping traits using reference transcriptome data",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Eric R",
+        "family": "Gamazon"
+      },
+      {
+        "given": "Heather E",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Kaanan P",
+        "family": "Shah"
+      },
+      {
+        "given": "Sahar V",
+        "family": "Mozaffari"
+      },
+      {
+        "given": "Keston",
+        "family": "Aquino-Michaels"
+      },
+      {
+        "given": "Robert J",
+        "family": "Carroll"
+      },
+      {
+        "given": "Anne E",
+        "family": "Eyler"
+      },
+      {
+        "given": "Joshua C",
+        "family": "Denny"
+      },
+      {
+        "given": "Dan L",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Nancy J",
+        "family": "Cox"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          8,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7p9zv",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC4552594",
+    "PMID": "26258848",
+    "id": "Z8bvDdVq",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3367"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "3",
+    "DOI": "10.1038/ng.3506",
+    "type": "article-journal",
+    "page": "245-252",
+    "source": "Crossref",
+    "title": "Integrative approaches for large-scale transcriptome-wide association studies",
+    "volume": "48",
+    "author": [
+      {
+        "given": "Alexander",
+        "family": "Gusev"
+      },
+      {
+        "given": "Arthur",
+        "family": "Ko"
+      },
+      {
+        "given": "Huwenbo",
+        "family": "Shi"
+      },
+      {
+        "given": "Gaurav",
+        "family": "Bhatia"
+      },
+      {
+        "given": "Wonil",
+        "family": "Chung"
+      },
+      {
+        "given": "Brenda W J H",
+        "family": "Penninx"
+      },
+      {
+        "given": "Rick",
+        "family": "Jansen"
+      },
+      {
+        "given": "Eco J C",
+        "family": "de Geus"
+      },
+      {
+        "given": "Dorret I",
+        "family": "Boomsma"
+      },
+      {
+        "given": "Fred A",
+        "family": "Wright"
+      },
+      {
+        "given": "Patrick F",
+        "family": "Sullivan"
+      },
+      {
+        "given": "Elina",
+        "family": "Nikkola"
+      },
+      {
+        "given": "Marcus",
+        "family": "Alvarez"
+      },
+      {
+        "given": "Mete",
+        "family": "Civelek"
+      },
+      {
+        "given": "Aldons J",
+        "family": "Lusis"
+      },
+      {
+        "given": "Terho",
+        "family": "Lehtimäki"
+      },
+      {
+        "given": "Emma",
+        "family": "Raitoharju"
+      },
+      {
+        "given": "Mika",
+        "family": "Kähönen"
+      },
+      {
+        "given": "Ilkka",
+        "family": "Seppälä"
+      },
+      {
+        "given": "Olli T",
+        "family": "Raitakari"
+      },
+      {
+        "given": "Johanna",
+        "family": "Kuusisto"
+      },
+      {
+        "given": "Markku",
+        "family": "Laakso"
+      },
+      {
+        "given": "Alkes L",
+        "family": "Price"
+      },
+      {
+        "given": "Päivi",
+        "family": "Pajukanta"
+      },
+      {
+        "given": "Bogdan",
+        "family": "Pasaniuc"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          2,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3vf4p",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC4767558",
+    "PMID": "26854917",
+    "id": "1D63fEEPb",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3506"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7",
+    "DOI": "10.1038/ng.3570",
+    "type": "article-journal",
+    "page": "709-717",
+    "source": "Crossref",
+    "title": "Detection and interpretation of shared genetic influences on 42 human traits",
+    "volume": "48",
+    "author": [
+      {
+        "given": "Joseph K",
+        "family": "Pickrell"
+      },
+      {
+        "given": "Tomaz",
+        "family": "Berisa"
+      },
+      {
+        "given": "Jimmy Z",
+        "family": "Liu"
+      },
+      {
+        "given": "Laure",
+        "family": "Ségurel"
+      },
+      {
+        "given": "Joyce Y",
+        "family": "Tung"
+      },
+      {
+        "given": "David A",
+        "family": "Hinds"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          5,
+          16
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f8ssw4",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5207801",
+    "PMID": "27182965",
+    "id": "PDWEwciL",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3570"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "12",
+    "DOI": "10.1038/ng.3985",
+    "type": "article-journal",
+    "page": "1752-1757",
+    "source": "Crossref",
+    "title": "Shared genetic origin of asthma, hay fever and eczema elucidates allergic disease biology",
+    "volume": "49",
+    "author": [
+      {
+        "given": "Manuel A",
+        "family": "Ferreira"
+      },
+      {},
+      {
+        "given": "Judith M",
+        "family": "Vonk"
+      },
+      {
+        "given": "Hansjörg",
+        "family": "Baurecht"
+      },
+      {
+        "given": "Ingo",
+        "family": "Marenholz"
+      },
+      {
+        "given": "Chao",
+        "family": "Tian"
+      },
+      {
+        "given": "Joshua D",
+        "family": "Hoffman"
+      },
+      {
+        "given": "Quinta",
+        "family": "Helmer"
+      },
+      {
+        "given": "Annika",
+        "family": "Tillander"
+      },
+      {
+        "given": "Vilhelmina",
+        "family": "Ullemar"
+      },
+      {
+        "given": "Jenny",
+        "family": "van Dongen"
+      },
+      {
+        "given": "Yi",
+        "family": "Lu"
+      },
+      {
+        "given": "Franz",
+        "family": "Rüschendorf"
+      },
+      {
+        "given": "Jorge",
+        "family": "Esparza-Gordillo"
+      },
+      {
+        "given": "Chris W",
+        "family": "Medway"
+      },
+      {
+        "given": "Edward",
+        "family": "Mountjoy"
+      },
+      {
+        "given": "Kimberley",
+        "family": "Burrows"
+      },
+      {
+        "given": "Oliver",
+        "family": "Hummel"
+      },
+      {
+        "given": "Sarah",
+        "family": "Grosche"
+      },
+      {
+        "given": "Ben M",
+        "family": "Brumpton"
+      },
+      {
+        "given": "John S",
+        "family": "Witte"
+      },
+      {
+        "given": "Jouke-Jan",
+        "family": "Hottenga"
+      },
+      {
+        "given": "Gonneke",
+        "family": "Willemsen"
+      },
+      {
+        "given": "Jie",
+        "family": "Zheng"
+      },
+      {
+        "given": "Elke",
+        "family": "Rodríguez"
+      },
+      {
+        "given": "Melanie",
+        "family": "Hotze"
+      },
+      {
+        "given": "Andre",
+        "family": "Franke"
+      },
+      {
+        "given": "Joana A",
+        "family": "Revez"
+      },
+      {
+        "given": "Jonathan",
+        "family": "Beesley"
+      },
+      {
+        "given": "Melanie C",
+        "family": "Matheson"
+      },
+      {
+        "given": "Shyamali C",
+        "family": "Dharmage"
+      },
+      {
+        "given": "Lisa M",
+        "family": "Bain"
+      },
+      {
+        "given": "Lars G",
+        "family": "Fritsche"
+      },
+      {
+        "given": "Maiken E",
+        "family": "Gabrielsen"
+      },
+      {
+        "given": "Brunilda",
+        "family": "Balliu"
+      },
+      {
+        "given": "Jonas B",
+        "family": "Nielsen"
+      },
+      {
+        "given": "Wei",
+        "family": "Zhou"
+      },
+      {
+        "given": "Kristian",
+        "family": "Hveem"
+      },
+      {
+        "given": "Arnulf",
+        "family": "Langhammer"
+      },
+      {
+        "given": "Oddgeir L",
+        "family": "Holmen"
+      },
+      {
+        "given": "Mari",
+        "family": "Løset"
+      },
+      {
+        "given": "Gonçalo R",
+        "family": "Abecasis"
+      },
+      {
+        "given": "Cristen J",
+        "family": "Willer"
+      },
+      {
+        "given": "Andreas",
+        "family": "Arnold"
+      },
+      {
+        "given": "Georg",
+        "family": "Homuth"
+      },
+      {
+        "given": "Carsten O",
+        "family": "Schmidt"
+      },
+      {
+        "given": "Philip J",
+        "family": "Thompson"
+      },
+      {
+        "given": "Nicholas G",
+        "family": "Martin"
+      },
+      {
+        "given": "David L",
+        "family": "Duffy"
+      },
+      {
+        "given": "Natalija",
+        "family": "Novak"
+      },
+      {
+        "given": "Holger",
+        "family": "Schulz"
+      },
+      {
+        "given": "Stefan",
+        "family": "Karrasch"
+      },
+      {
+        "given": "Christian",
+        "family": "Gieger"
+      },
+      {
+        "given": "Konstantin",
+        "family": "Strauch"
+      },
+      {
+        "given": "Ronald B",
+        "family": "Melles"
+      },
+      {
+        "given": "David A",
+        "family": "Hinds"
+      },
+      {
+        "given": "Norbert",
+        "family": "Hübner"
+      },
+      {
+        "given": "Stephan",
+        "family": "Weidinger"
+      },
+      {
+        "given": "Patrik K E",
+        "family": "Magnusson"
+      },
+      {
+        "given": "Rick",
+        "family": "Jansen"
+      },
+      {
+        "given": "Eric",
+        "family": "Jorgenson"
+      },
+      {
+        "given": "Young-Ae",
+        "family": "Lee"
+      },
+      {
+        "given": "Dorret I",
+        "family": "Boomsma"
+      },
+      {
+        "given": "Catarina",
+        "family": "Almqvist"
+      },
+      {
+        "given": "Robert",
+        "family": "Karlsson"
+      },
+      {
+        "given": "Gerard H",
+        "family": "Koppelman"
+      },
+      {
+        "given": "Lavinia",
+        "family": "Paternoster"
+      },
+      {},
+      {},
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          10,
+          30
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gchg62",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5989923",
+    "PMID": "29083406",
+    "id": "LyJmyoQr",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/ng.3985"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "10",
+    "DOI": "10.1038/nn.4618",
+    "type": "article-journal",
+    "page": "1342-1349",
+    "source": "Crossref",
+    "title": "Analysis of genome-wide association data highlights candidates for drug repositioning in psychiatry",
+    "volume": "20",
+    "author": [
+      {
+        "given": "Hon-Cheong",
+        "family": "So"
+      },
+      {
+        "given": "Carlos Kwan-Long",
+        "family": "Chau"
+      },
+      {
+        "given": "Wan-To",
+        "family": "Chiu"
+      },
+      {
+        "given": "Kin-Sang",
+        "family": "Ho"
+      },
+      {
+        "given": "Cho-Pong",
+        "family": "Lo"
+      },
+      {
+        "given": "Stephanie Ho-Yue",
+        "family": "Yim"
+      },
+      {
+        "given": "Pak-Chung",
+        "family": "Sham"
+      }
+    ],
+    "container-title": "Nature Neuroscience",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          8,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gbrssh",
+    "container-title-short": "Nat Neurosci",
+    "PMID": "28805813",
+    "id": "17oeJ0CXy",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/nn.4618"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-03424-4",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "A comprehensive evaluation of module detection methods for gene expression data",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Wouter",
+        "family": "Saelens"
+      },
+      {
+        "given": "Robrecht",
+        "family": "Cannoodt"
+      },
+      {
+        "given": "Yvan",
+        "family": "Saeys"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          3,
+          15
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gc9x36",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC5854612",
+    "PMID": "29545622",
+    "id": "1BVbSrr6M",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-03424-4"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-03621-1",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Exploring the phenotypic consequences of tissue specific gene expression variation inferred from GWAS summary statistics",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {},
+      {
+        "given": "Scott P.",
+        "family": "Dickinson"
+      },
+      {
+        "given": "Rodrigo",
+        "family": "Bonazzola"
+      },
+      {
+        "given": "Jiamao",
+        "family": "Zheng"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Jason M.",
+        "family": "Torres"
+      },
+      {
+        "given": "Eric S.",
+        "family": "Torstenson"
+      },
+      {
+        "given": "Kaanan P.",
+        "family": "Shah"
+      },
+      {
+        "given": "Tzintzuni",
+        "family": "Garcia"
+      },
+      {
+        "given": "Todd L.",
+        "family": "Edwards"
+      },
+      {
+        "given": "Eli A.",
+        "family": "Stahl"
+      },
+      {
+        "given": "Laura M.",
+        "family": "Huckins"
+      },
+      {
+        "given": "Dan L.",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Nancy J.",
+        "family": "Cox"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          5,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdjvp5",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC5940825",
+    "PMID": "29739930",
+    "id": "vLyTudUB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-03621-1"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-03751-6",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Massive mining of publicly available RNA-seq data from human and mouse",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Alexander",
+        "family": "Lachmann"
+      },
+      {
+        "given": "Denis",
+        "family": "Torre"
+      },
+      {
+        "given": "Alexandra B.",
+        "family": "Keenan"
+      },
+      {
+        "given": "Kathleen M.",
+        "family": "Jagodnik"
+      },
+      {
+        "given": "Hoyjin J.",
+        "family": "Lee"
+      },
+      {
+        "given": "Lily",
+        "family": "Wang"
+      },
+      {
+        "given": "Moshe C.",
+        "family": "Silverstein"
+      },
+      {
+        "given": "Avi",
+        "family": "Ma’ayan"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          4,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gc92dr",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC5893633",
+    "PMID": "29636450",
+    "id": "Nz3IMEzd",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-03751-6"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/s41467-018-06022-6",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Conditional and interaction gene-set analysis reveals novel functional pathways for blood pressure",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Christiaan A.",
+        "family": "de Leeuw"
+      },
+      {
+        "given": "Sven",
+        "family": "Stringer"
+      },
+      {
+        "given": "Ilona A.",
+        "family": "Dekkers"
+      },
+      {
+        "given": "Tom",
+        "family": "Heskes"
+      },
+      {
+        "given": "Danielle",
+        "family": "Posthuma"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          9,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gd6d85",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC6138636",
+    "PMID": "30218068",
+    "id": "Om8ZhS06",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-018-06022-6"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Millions of transcriptomic profiles have been deposited in public archives, yet remain underused for the interpretation of new experiments. We present a method for interpreting new transcriptomic datasets through instant comparison to public datasets without high-performance computing requirements. We apply Principal Component Analysis on 536 studies comprising 44,890 human RNA sequencing profiles and aggregate sufficiently similar loading vectors to form Replicable Axes of Variation (RAV). RAVs are annotated with metadata of originating studies and by gene set enrichment analysis. Functionality to associate new datasets with RAVs, extract interpretable annotations, and provide intuitive visualization are implemented as the GenomicSuperSignature R/Bioconductor package. We demonstrate the efficient and coherent database search, robustness to batch effects and heterogeneous training data, and transfer learning capacity of our method using TCGA and rare diseases datasets. GenomicSuperSignature aids in analyzing new gene expression data in the context of existing databases using minimal computing resources.</jats:p>",
+    "DOI": "10.1038/s41467-022-31411-3",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "GenomicSuperSignature facilitates interpretation of RNA-seq experiments through robust, efficient comparison to public databases",
+    "volume": "13",
+    "author": [
+      {
+        "given": "Sehyun",
+        "family": "Oh"
+      },
+      {
+        "given": "Ludwig",
+        "family": "Geistlinger"
+      },
+      {
+        "given": "Marcel",
+        "family": "Ramos"
+      },
+      {
+        "given": "Daniel",
+        "family": "Blankenberg"
+      },
+      {
+        "given": "Marius",
+        "family": "van den Beek"
+      },
+      {
+        "given": "Jaclyn N.",
+        "family": "Taroni"
+      },
+      {
+        "given": "Vincent J.",
+        "family": "Carey"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      },
+      {
+        "given": "Levi",
+        "family": "Waldron"
+      },
+      {
+        "given": "Sean",
+        "family": "Davis"
+      }
+    ],
+    "container-title": "Nature Communications",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2022,
+          6,
+          27
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqd7hm",
+    "container-title-short": "Nat Commun",
+    "PMCID": "PMC9237024",
+    "PMID": "35760813",
+    "id": "X4fhSCkz",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41467-022-31411-3"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "3",
+    "DOI": "10.1038/s41576-019-0200-9",
+    "type": "article-journal",
+    "page": "137-150",
+    "source": "Crossref",
+    "title": "Mechanisms of tissue and cell-type specificity in heritable traits and diseases",
+    "volume": "21",
+    "author": [
+      {
+        "given": "Idan",
+        "family": "Hekselman"
+      },
+      {
+        "given": "Esti",
+        "family": "Yeger-Lotem"
+      }
+    ],
+    "container-title": "Nature Reviews Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          1,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggkx9v",
+    "container-title-short": "Nat Rev Genet",
+    "PMID": "31913361",
+    "id": "nhaocxmR",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41576-019-0200-9"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7726",
+    "DOI": "10.1038/s41586-018-0579-z",
+    "type": "article-journal",
+    "page": "203-209",
+    "source": "Crossref",
+    "title": "The UK Biobank resource with deep phenotyping and genomic data",
+    "volume": "562",
+    "author": [
+      {
+        "given": "Clare",
+        "family": "Bycroft"
+      },
+      {
+        "given": "Colin",
+        "family": "Freeman"
+      },
+      {
+        "given": "Desislava",
+        "family": "Petkova"
+      },
+      {
+        "given": "Gavin",
+        "family": "Band"
+      },
+      {
+        "given": "Lloyd T.",
+        "family": "Elliott"
+      },
+      {
+        "given": "Kevin",
+        "family": "Sharp"
+      },
+      {
+        "given": "Allan",
+        "family": "Motyer"
+      },
+      {
+        "given": "Damjan",
+        "family": "Vukcevic"
+      },
+      {
+        "given": "Olivier",
+        "family": "Delaneau"
+      },
+      {
+        "given": "Jared",
+        "family": "O’Connell"
+      },
+      {
+        "given": "Adrian",
+        "family": "Cortes"
+      },
+      {
+        "given": "Samantha",
+        "family": "Welsh"
+      },
+      {
+        "given": "Alan",
+        "family": "Young"
+      },
+      {
+        "given": "Mark",
+        "family": "Effingham"
+      },
+      {
+        "given": "Gil",
+        "family": "McVean"
+      },
+      {
+        "given": "Stephen",
+        "family": "Leslie"
+      },
+      {
+        "given": "Naomi",
+        "family": "Allen"
+      },
+      {
+        "given": "Peter",
+        "family": "Donnelly"
+      },
+      {
+        "given": "Jonathan",
+        "family": "Marchini"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          10
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfb7h2",
+    "container-title-short": "Nature",
+    "PMCID": "PMC6786975",
+    "PMID": "30305743",
+    "id": "nmJxPpE5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41586-018-0579-z"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7845",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Annotating the molecular basis of human disease remains an unsolved challenge, as 93% of disease loci are non-coding and gene-regulatory annotations are highly incomplete<jats:sup>1–3</jats:sup>. Here we present EpiMap, a compendium comprising 10,000 epigenomic maps across 800 samples, which we used to define chromatin states, high-resolution enhancers, enhancer modules, upstream regulators and downstream target genes. We used this resource to annotate 30,000 genetic loci that were associated with 540 traits<jats:sup>4</jats:sup>, predicting trait-relevant tissues, putative causal nucleotide variants in enriched tissue enhancers and candidate tissue-specific target genes for each. We partitioned multifactorial traits into tissue-specific contributing factors with distinct functional enrichments and disease comorbidity patterns, and revealed both single-factor monotropic and multifactor pleiotropic loci. Top-scoring loci frequently had multiple predicted driver variants, converging through multiple enhancers with a common target gene, multiple genes in common tissues, or multiple genes and multiple tissues, indicating extensive pleiotropy. Our results demonstrate the importance of dense, rich, high-resolution epigenomic annotations for the investigation of complex traits.</jats:p>",
+    "DOI": "10.1038/s41586-020-03145-z",
+    "type": "article-journal",
+    "page": "300-307",
+    "source": "Crossref",
+    "title": "Regulatory genomic circuitry of human disease loci by integrative epigenomics",
+    "volume": "590",
+    "author": [
+      {
+        "given": "Carles A.",
+        "family": "Boix"
+      },
+      {
+        "given": "Benjamin T.",
+        "family": "James"
+      },
+      {
+        "given": "Yongjin P.",
+        "family": "Park"
+      },
+      {
+        "given": "Wouter",
+        "family": "Meuleman"
+      },
+      {
+        "given": "Manolis",
+        "family": "Kellis"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          2,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghzkhr",
+    "container-title-short": "Nature",
+    "PMCID": "PMC7875769",
+    "PMID": "33536621",
+    "id": "xRGqPsT2",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41586-020-03145-z"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7820",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>DNase I hypersensitive sites (DHSs) are generic markers of regulatory DNA<jats:sup>1–5</jats:sup>and contain genetic variations associated with diseases and phenotypic traits<jats:sup>6–8</jats:sup>. We created high-resolution maps of DHSs from 733 human biosamples encompassing 438 cell and tissue types and states, and integrated these to delineate and numerically index approximately 3.6 million DHSs within the human genome sequence, providing a common coordinate system for regulatory DNA. Here we show that these maps highly resolve the<jats:italic>cis</jats:italic>-regulatory compartment of the human genome, which encodes unexpectedly diverse cell- and tissue-selective regulatory programs at very high density. These programs can be captured comprehensively by a simple vocabulary that enables the assignment to each DHS of a regulatory barcode that encapsulates its tissue manifestations, and global annotation of protein-coding and non-coding RNA genes in a manner orthogonal to gene expression. Finally, we show that sharply resolved DHSs markedly enhance the genetic association and heritability signals of diseases and traits. Rather than being confined to a small number of distal elements or promoters, we find that genetic signals converge on congruently regulated sets of DHSs that decorate entire gene bodies. Together, our results create a universal, extensible coordinate system and vocabulary for human regulatory DNA marked by DHSs, and provide a new global perspective on the architecture of human gene regulation.</jats:p>",
+    "DOI": "10.1038/s41586-020-2559-3",
+    "type": "article-journal",
+    "page": "244-251",
+    "source": "Crossref",
+    "title": "Index and biological spectrum of human DNase I hypersensitive sites",
+    "volume": "584",
+    "author": [
+      {
+        "given": "Wouter",
+        "family": "Meuleman"
+      },
+      {
+        "given": "Alexander",
+        "family": "Muratov"
+      },
+      {
+        "given": "Eric",
+        "family": "Rynes"
+      },
+      {
+        "given": "Jessica",
+        "family": "Halow"
+      },
+      {
+        "given": "Kristen",
+        "family": "Lee"
+      },
+      {
+        "given": "Daniel",
+        "family": "Bates"
+      },
+      {
+        "given": "Morgan",
+        "family": "Diegel"
+      },
+      {
+        "given": "Douglas",
+        "family": "Dunn"
+      },
+      {
+        "given": "Fidencio",
+        "family": "Neri"
+      },
+      {
+        "given": "Athanasios",
+        "family": "Teodosiadis"
+      },
+      {
+        "given": "Alex",
+        "family": "Reynolds"
+      },
+      {
+        "given": "Eric",
+        "family": "Haugen"
+      },
+      {
+        "given": "Jemma",
+        "family": "Nelson"
+      },
+      {
+        "given": "Audra",
+        "family": "Johnson"
+      },
+      {
+        "given": "Mark",
+        "family": "Frerker"
+      },
+      {
+        "given": "Michael",
+        "family": "Buckley"
+      },
+      {
+        "given": "Richard",
+        "family": "Sandstrom"
+      },
+      {
+        "given": "Jeff",
+        "family": "Vierstra"
+      },
+      {
+        "given": "Rajinder",
+        "family": "Kaul"
+      },
+      {
+        "given": "John",
+        "family": "Stamatoyannopoulos"
+      }
+    ],
+    "container-title": "Nature",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          7,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gg6dhp",
+    "container-title-short": "Nature",
+    "PMCID": "PMC7422677",
+    "PMID": "32728217",
+    "id": "1DoyZS7y0",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41586-020-2559-3"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/s41588-018-0081-4",
+    "type": "article-journal",
+    "page": "621-629",
+    "source": "Crossref",
+    "title": "Heritability enrichment of specifically expressed genes identifies disease-relevant tissues and cell types",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Hilary K.",
+        "family": "Finucane"
+      },
+      {
+        "given": "Yakir A.",
+        "family": "Reshef"
+      },
+      {
+        "given": "Verneri",
+        "family": "Anttila"
+      },
+      {
+        "given": "Kamil",
+        "family": "Slowikowski"
+      },
+      {
+        "given": "Alexander",
+        "family": "Gusev"
+      },
+      {
+        "given": "Andrea",
+        "family": "Byrnes"
+      },
+      {
+        "given": "Steven",
+        "family": "Gazal"
+      },
+      {
+        "given": "Po-Ru",
+        "family": "Loh"
+      },
+      {
+        "given": "Caleb",
+        "family": "Lareau"
+      },
+      {
+        "given": "Noam",
+        "family": "Shoresh"
+      },
+      {
+        "given": "Giulio",
+        "family": "Genovese"
+      },
+      {
+        "given": "Arpiar",
+        "family": "Saunders"
+      },
+      {
+        "given": "Evan",
+        "family": "Macosko"
+      },
+      {
+        "given": "Samuela",
+        "family": "Pollack"
+      },
+      {
+        "given": "John R. B.",
+        "family": "Perry"
+      },
+      {
+        "given": "Jason D.",
+        "family": "Buenrostro"
+      },
+      {
+        "given": "Bradley E.",
+        "family": "Bernstein"
+      },
+      {
+        "given": "Soumya",
+        "family": "Raychaudhuri"
+      },
+      {
+        "given": "Steven",
+        "family": "McCarroll"
+      },
+      {
+        "given": "Benjamin M.",
+        "family": "Neale"
+      },
+      {
+        "given": "Alkes L.",
+        "family": "Price"
+      },
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdfjqt",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5896795",
+    "PMID": "29632380",
+    "id": "WFslDIWl",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-018-0081-4"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/s41588-018-0092-1",
+    "type": "article-journal",
+    "page": "538-548",
+    "source": "Crossref",
+    "title": "Transcriptome-wide association study of schizophrenia and chromatin activity yields mechanistic disease insights",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Alexander",
+        "family": "Gusev"
+      },
+      {
+        "given": "Nicholas",
+        "family": "Mancuso"
+      },
+      {
+        "given": "Hyejung",
+        "family": "Won"
+      },
+      {
+        "given": "Maria",
+        "family": "Kousi"
+      },
+      {
+        "given": "Hilary K.",
+        "family": "Finucane"
+      },
+      {
+        "given": "Yakir",
+        "family": "Reshef"
+      },
+      {
+        "given": "Lingyun",
+        "family": "Song"
+      },
+      {
+        "given": "Alexias",
+        "family": "Safi"
+      },
+      {
+        "given": "Steven",
+        "family": "McCarroll"
+      },
+      {
+        "given": "Benjamin M.",
+        "family": "Neale"
+      },
+      {
+        "given": "Roel A.",
+        "family": "Ophoff"
+      },
+      {
+        "given": "Michael C.",
+        "family": "O’Donovan"
+      },
+      {
+        "given": "Gregory E.",
+        "family": "Crawford"
+      },
+      {
+        "given": "Daniel H.",
+        "family": "Geschwind"
+      },
+      {
+        "given": "Nicholas",
+        "family": "Katsanis"
+      },
+      {
+        "given": "Patrick F.",
+        "family": "Sullivan"
+      },
+      {
+        "given": "Bogdan",
+        "family": "Pasaniuc"
+      },
+      {
+        "given": "Alkes L.",
+        "family": "Price"
+      },
+      {}
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdfdf2",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5942893",
+    "PMID": "29632383",
+    "id": "AxVJwanp",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-018-0092-1"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "6",
+    "DOI": "10.1038/s41588-018-0121-0",
+    "type": "article-journal",
+    "page": "857-864",
+    "source": "Crossref",
+    "title": "A genome-wide cross-trait analysis from UK Biobank highlights the shared genetic architecture of asthma and allergic diseases",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Zhaozhong",
+        "family": "Zhu"
+      },
+      {
+        "given": "Phil H.",
+        "family": "Lee"
+      },
+      {
+        "given": "Mark D.",
+        "family": "Chaffin"
+      },
+      {
+        "given": "Wonil",
+        "family": "Chung"
+      },
+      {
+        "given": "Po-Ru",
+        "family": "Loh"
+      },
+      {
+        "given": "Quan",
+        "family": "Lu"
+      },
+      {
+        "given": "David C.",
+        "family": "Christiani"
+      },
+      {
+        "given": "Liming",
+        "family": "Liang"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          5,
+          21
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gdpmtn",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC5980765",
+    "PMID": "29785011",
+    "id": "veADXImD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-018-0121-0"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "4",
+    "DOI": "10.1038/s41588-019-0385-z",
+    "type": "article-journal",
+    "page": "592-599",
+    "source": "Crossref",
+    "title": "Opportunities and challenges for transcriptome-wide association studies",
+    "volume": "51",
+    "author": [
+      {
+        "given": "Michael",
+        "family": "Wainberg"
+      },
+      {
+        "given": "Nasa",
+        "family": "Sinnott-Armstrong"
+      },
+      {
+        "given": "Nicholas",
+        "family": "Mancuso"
+      },
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "David A.",
+        "family": "Knowles"
+      },
+      {
+        "given": "David",
+        "family": "Golan"
+      },
+      {
+        "given": "Raili",
+        "family": "Ermel"
+      },
+      {
+        "given": "Arno",
+        "family": "Ruusalepp"
+      },
+      {
+        "given": "Thomas",
+        "family": "Quertermous"
+      },
+      {
+        "given": "Ke",
+        "family": "Hao"
+      },
+      {
+        "given": "Johan L. M.",
+        "family": "Björkegren"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Bogdan",
+        "family": "Pasaniuc"
+      },
+      {
+        "given": "Manuel A.",
+        "family": "Rivas"
+      },
+      {
+        "given": "Anshul",
+        "family": "Kundaje"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          3,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf3hmr",
+    "container-title-short": "Nat Genet",
+    "PMCID": "PMC6777347",
+    "PMID": "30926968",
+    "id": "l6ogswV3",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-019-0385-z"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "9",
+    "DOI": "10.1038/s41588-019-0481-0",
+    "type": "article-journal",
+    "page": "1339-1348",
+    "source": "Crossref",
+    "title": "A global overview of pleiotropy and genetic architecture in complex traits",
+    "volume": "51",
+    "author": [
+      {
+        "given": "Kyoko",
+        "family": "Watanabe"
+      },
+      {
+        "given": "Sven",
+        "family": "Stringer"
+      },
+      {
+        "given": "Oleksandr",
+        "family": "Frei"
+      },
+      {
+        "given": "Maša",
+        "family": "Umićević Mirkov"
+      },
+      {
+        "given": "Christiaan",
+        "family": "de Leeuw"
+      },
+      {
+        "given": "Tinca J. C.",
+        "family": "Polderman"
+      },
+      {
+        "given": "Sophie",
+        "family": "van der Sluis"
+      },
+      {
+        "given": "Ole A.",
+        "family": "Andreassen"
+      },
+      {
+        "given": "Benjamin M.",
+        "family": "Neale"
+      },
+      {
+        "given": "Danielle",
+        "family": "Posthuma"
+      }
+    ],
+    "container-title": "Nature Genetics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          8,
+          19
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggr84r",
+    "container-title-short": "Nat Genet",
+    "PMID": "31427789",
+    "id": "pZZn28he",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41588-019-0481-0"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "7",
+    "DOI": "10.1038/s41592-019-0456-1",
+    "type": "article-journal",
+    "page": "607-610",
+    "source": "Crossref",
+    "title": "Pathway-level information extractor (PLIER) for gene expression data",
+    "volume": "16",
+    "author": [
+      {
+        "given": "Weiguang",
+        "family": "Mao"
+      },
+      {
+        "given": "Elena",
+        "family": "Zaslavsky"
+      },
+      {
+        "given": "Boris M.",
+        "family": "Hartmann"
+      },
+      {
+        "given": "Stuart C.",
+        "family": "Sealfon"
+      },
+      {
+        "given": "Maria",
+        "family": "Chikina"
+      }
+    ],
+    "container-title": "Nature Methods",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          6,
+          27
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf75g6",
+    "container-title-short": "Nat Methods",
+    "PMCID": "PMC7262669",
+    "PMID": "31249421",
+    "id": "Ki2ij7zE",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/s41592-019-0456-1"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "12",
+    "DOI": "10.1038/sj.jid.5700586",
+    "type": "article-journal",
+    "page": "2637-2646",
+    "source": "Crossref",
+    "title": "Langerhans Cells Release Prostaglandin D2 in Response to Nicotinic Acid",
+    "volume": "126",
+    "author": [
+      {
+        "given": "Dominique",
+        "family": "Maciejewski-Lenoir"
+      },
+      {
+        "given": "Jeremy G.",
+        "family": "Richman"
+      },
+      {
+        "given": "Yaron",
+        "family": "Hakak"
+      },
+      {
+        "given": "Ibragim",
+        "family": "Gaidarov"
+      },
+      {
+        "given": "Dominic P.",
+        "family": "Behan"
+      },
+      {
+        "given": "Daniel T.",
+        "family": "Connolly"
+      }
+    ],
+    "container-title": "Journal of Investigative Dermatology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2006,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/dgxg75",
+    "container-title-short": "Journal of Investigative Dermatology",
+    "PMID": "17008871",
+    "id": "wI0IjT3i",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/sj.jid.5700586"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1038/srep16882",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Mycobacterial infection induces a specific human innate immune response",
+    "volume": "5",
+    "author": [
+      {
+        "given": "John D.",
+        "family": "Blischak"
+      },
+      {
+        "given": "Ludovic",
+        "family": "Tailleux"
+      },
+      {
+        "given": "Amy",
+        "family": "Mitrano"
+      },
+      {
+        "given": "Luis B.",
+        "family": "Barreiro"
+      },
+      {
+        "given": "Yoav",
+        "family": "Gilad"
+      }
+    ],
+    "container-title": "Scientific Reports",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          11,
+          20
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7zk5c",
+    "container-title-short": "Sci Rep",
+    "PMCID": "PMC4653619",
+    "PMID": "26586179",
+    "id": "1kgcHkGm",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1038/srep16882"
+  },
+  {
+    "publisher": "CSIRO Publishing",
+    "issue": "7",
+    "abstract": "<jats:p>\nPolycystic ovary syndrome (PCOS) is one of the most common ovarian diseases among women of reproductive age. The reproductive and metabolic traits of PCOS are underpinned by adipocyte dysfunction, especially diminished adiponectin secretion. Based on evidence that niacin stimulates adiponectin secretion, this study evaluated the effects of niacin on adiponectin concentrations and reproductive traits in a rat model of PCOS. PCOS was induced by single injection of 4mg kg−1 oestradiol valerate (i.m.), and PCOS groups were administered orally with saline or niacin (10 or 25mg kg−1) daily for 30 days after PCOS induction. The control group received 0.2mL sesame oil (i.m.) only. At the end of the experimental period, serum samples and ovaries were collected for adiponectin, histological and molecular analyses. Niacin reduced the bodyweight gain and increased ovary weights in PCOS rats. Niacin also increased the number of normal antral follicles and corpora lutea while reducing the number of cystic follicles and the thickness of theca interna. Moreover, niacin significantly increased serum adiponectin concentration and the gene expression of adiponectin and its type 1 receptor. In conclusion, this study indicates that niacin reduces cystic follicles and improves ovulation in PCOS rats. Adiponectin signalling may have contributed, in part, to the beneficial effects.\n</jats:p>",
+    "DOI": "10.1071/rd20306",
+    "type": "article-journal",
+    "page": "447",
+    "source": "Crossref",
+    "title": "Chronic niacin administration ameliorates ovulation, histological changes in the ovary and adiponectin concentrations in a rat model of polycystic ovary syndrome",
+    "volume": "33",
+    "author": [
+      {
+        "given": "Negin",
+        "family": "Asadi"
+      },
+      {
+        "given": "Mahin",
+        "family": "Izadi"
+      },
+      {
+        "given": "Ali",
+        "family": "Aflatounian"
+      },
+      {
+        "given": "Mansour",
+        "family": "Esmaeili-Dehaj"
+      },
+      {
+        "given": "Mohammad Ebrahim",
+        "family": "Rezvani"
+      },
+      {
+        "given": "Zeinab",
+        "family": "Hafizi"
+      }
+    ],
+    "container-title": "Reproduction, Fertility and Development",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2021
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjpjkt",
+    "container-title-short": "Reprod. Fertil. Dev.",
+    "PMID": "33751926",
+    "id": "TovvsrDr",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1071/rd20306"
+  },
+  {
+    "publisher": "Proceedings of the National Academy of Sciences",
+    "issue": "52",
+    "abstract": "<jats:p>Heritable diseases are caused by germ-line mutations that, despite tissuewide presence, often lead to tissue-specific pathology. Here, we make a systematic analysis of the link between tissue-specific gene expression and pathological manifestations in many human diseases and cancers. Diseases were systematically mapped to tissues they affect from disease-relevant literature in PubMed to create a disease–tissue covariation matrix of high-confidence associations of &gt;1,000 diseases to 73 tissues. By retrieving &gt;2,000 known disease genes, and generating 1,500 disease-associated protein complexes, we analyzed the differential expression of a gene or complex involved in a particular disease in the tissues affected by the disease, compared with nonaffected tissues. When this analysis is scaled to all diseases in our dataset, there is a significant tendency for disease genes and complexes to be overexpressed in the normal tissues where defects cause pathology. In contrast, cancer genes and complexes were not overexpressed in the tissues from which the tumors emanate. We specifically identified a complex involved in XY sex reversal that is testis-specific and down-regulated in ovaries. We also identified complexes in Parkinson disease, cardiomyopathies, and muscular dystrophy syndromes that are similarly tissue specific. Our method represents a conceptual scaffold for organism-spanning analyses and reveals an extensive list of tissue-specific draft molecular pathways, both known and unexpected, that might be disrupted in disease.</jats:p>",
+    "DOI": "10.1073/pnas.0810772105",
+    "type": "article-journal",
+    "page": "20870-20875",
+    "source": "Crossref",
+    "title": "A large-scale analysis of tissue-specific pathology and gene expression of human disease genes and complexes",
+    "volume": "105",
+    "author": [
+      {
+        "given": "Kasper",
+        "family": "Lage"
+      },
+      {
+        "given": "Niclas Tue",
+        "family": "Hansen"
+      },
+      {
+        "given": "E. Olof",
+        "family": "Karlberg"
+      },
+      {
+        "given": "Aron C.",
+        "family": "Eklund"
+      },
+      {
+        "given": "Francisco S.",
+        "family": "Roque"
+      },
+      {
+        "given": "Patricia K.",
+        "family": "Donahoe"
+      },
+      {
+        "given": "Zoltan",
+        "family": "Szallasi"
+      },
+      {
+        "given": "Thomas Skøt",
+        "family": "Jensen"
+      },
+      {
+        "given": "Søren",
+        "family": "Brunak"
+      }
+    ],
+    "container-title": "Proceedings of the National Academy of Sciences",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          12,
+          30
+        ]
+      ]
+    },
+    "URL": "https://doi.org/d5qcv9",
+    "container-title-short": "Proc. Natl. Acad. Sci. U.S.A.",
+    "PMCID": "PMC2606902",
+    "PMID": "19104045",
+    "id": "wNE0EQlN",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1073/pnas.0810772105"
+  },
+  {
+    "publisher": "IOP Publishing",
+    "DOI": "10.1088/1755-1315/31/1/012012",
+    "type": "article-journal",
+    "page": "012012",
+    "source": "Crossref",
+    "title": "Determination of Optimal Epsilon (Eps) Value on DBSCAN Algorithm to Clustering Data on Peatland Hotspots in Sumatra",
+    "volume": "31",
+    "author": [
+      {
+        "given": "Nadia",
+        "family": "Rahmah"
+      },
+      {
+        "given": "Imas Sukaesih",
+        "family": "Sitanggang"
+      }
+    ],
+    "container-title": "IOP Conference Series: Earth and Environmental Science",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqr7z2",
+    "container-title-short": "IOP Conf. Ser.: Earth Environ. Sci.",
+    "id": "FB7XPWl6",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1088/1755-1315/31/1/012012"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "8",
+    "DOI": "10.1093/bioinformatics/btq099",
+    "type": "article-journal",
+    "page": "1112-1118",
+    "source": "Crossref",
+    "title": "Modeling sample variables with an Experimental Factor Ontology",
+    "volume": "26",
+    "author": [
+      {
+        "given": "James",
+        "family": "Malone"
+      },
+      {
+        "given": "Ele",
+        "family": "Holloway"
+      },
+      {
+        "given": "Tomasz",
+        "family": "Adamusiak"
+      },
+      {
+        "given": "Misha",
+        "family": "Kapushesky"
+      },
+      {
+        "given": "Jie",
+        "family": "Zheng"
+      },
+      {
+        "given": "Nikolay",
+        "family": "Kolesnikov"
+      },
+      {
+        "given": "Anna",
+        "family": "Zhukova"
+      },
+      {
+        "given": "Alvis",
+        "family": "Brazma"
+      },
+      {
+        "given": "Helen",
+        "family": "Parkinson"
+      }
+    ],
+    "container-title": "Bioinformatics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2010,
+          3,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/dsb6vt",
+    "PMCID": "PMC2853691",
+    "PMID": "20200009",
+    "id": "9okjVu3s",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/bioinformatics/btq099"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "7",
+    "DOI": "10.1093/gigascience/giy083",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Clustering trees: a visualization for evaluating clusterings at multiple resolutions",
+    "volume": "7",
+    "author": [
+      {
+        "given": "Luke",
+        "family": "Zappia"
+      },
+      {
+        "given": "Alicia",
+        "family": "Oshlack"
+      }
+    ],
+    "container-title": "GigaScience",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          7,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfzqf5",
+    "PMCID": "PMC6057528",
+    "PMID": "30010766",
+    "id": "xhtEAzx6",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/gigascience/giy083"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "D1",
+    "DOI": "10.1093/nar/gkt1068",
+    "type": "article-journal",
+    "page": "D1091-D1097",
+    "source": "Crossref",
+    "title": "DrugBank 4.0: shedding new light on drug metabolism",
+    "volume": "42",
+    "author": [
+      {
+        "given": "Vivian",
+        "family": "Law"
+      },
+      {
+        "given": "Craig",
+        "family": "Knox"
+      },
+      {
+        "given": "Yannick",
+        "family": "Djoumbou"
+      },
+      {
+        "given": "Tim",
+        "family": "Jewison"
+      },
+      {
+        "given": "An Chi",
+        "family": "Guo"
+      },
+      {
+        "given": "Yifeng",
+        "family": "Liu"
+      },
+      {
+        "given": "Adam",
+        "family": "Maciejewski"
+      },
+      {
+        "given": "David",
+        "family": "Arndt"
+      },
+      {
+        "given": "Michael",
+        "family": "Wilson"
+      },
+      {
+        "given": "Vanessa",
+        "family": "Neveu"
+      },
+      {
+        "given": "Alexandra",
+        "family": "Tang"
+      },
+      {
+        "given": "Geraldine",
+        "family": "Gabriel"
+      },
+      {
+        "given": "Carol",
+        "family": "Ly"
+      },
+      {
+        "given": "Sakina",
+        "family": "Adamjee"
+      },
+      {
+        "given": "Zerihun T.",
+        "family": "Dame"
+      },
+      {
+        "given": "Beomsoo",
+        "family": "Han"
+      },
+      {
+        "given": "You",
+        "family": "Zhou"
+      },
+      {
+        "given": "David S.",
+        "family": "Wishart"
+      }
+    ],
+    "container-title": "Nucleic Acids Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          11,
+          6
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f3mn6d",
+    "container-title-short": "Nucl. Acids Res.",
+    "PMCID": "PMC3965102",
+    "PMID": "24203711",
+    "id": "6PR8LEXK",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/nar/gkt1068"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "D1",
+    "DOI": "10.1093/nar/gky1032",
+    "type": "article-journal",
+    "page": "D955-D962",
+    "source": "Crossref",
+    "title": "Human Disease Ontology 2018 update: classification, content and workflow expansion",
+    "volume": "47",
+    "author": [
+      {
+        "given": "Lynn M",
+        "family": "Schriml"
+      },
+      {
+        "given": "Elvira",
+        "family": "Mitraka"
+      },
+      {
+        "given": "James",
+        "family": "Munro"
+      },
+      {
+        "given": "Becky",
+        "family": "Tauber"
+      },
+      {
+        "given": "Mike",
+        "family": "Schor"
+      },
+      {
+        "given": "Lance",
+        "family": "Nickle"
+      },
+      {
+        "given": "Victor",
+        "family": "Felix"
+      },
+      {
+        "given": "Linda",
+        "family": "Jeng"
+      },
+      {
+        "given": "Cynthia",
+        "family": "Bearer"
+      },
+      {
+        "given": "Richard",
+        "family": "Lichenstein"
+      },
+      {
+        "given": "Katharine",
+        "family": "Bisordi"
+      },
+      {
+        "given": "Nicole",
+        "family": "Campion"
+      },
+      {
+        "given": "Brooke",
+        "family": "Hyman"
+      },
+      {
+        "given": "David",
+        "family": "Kurland"
+      },
+      {
+        "given": "Connor Patrick",
+        "family": "Oates"
+      },
+      {
+        "given": "Siobhan",
+        "family": "Kibbey"
+      },
+      {
+        "given": "Poorna",
+        "family": "Sreekumar"
+      },
+      {
+        "given": "Chris",
+        "family": "Le"
+      },
+      {
+        "given": "Michelle",
+        "family": "Giglio"
+      },
+      {
+        "given": "Carol",
+        "family": "Greene"
+      }
+    ],
+    "container-title": "Nucleic Acids Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          11,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggx9wp",
+    "PMCID": "PMC6323977",
+    "PMID": "30407550",
+    "id": "1FsruosUW",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/nar/gky1032"
+  },
+  {
+    "publisher": "Oxford University Press (OUP)",
+    "issue": "12",
+    "DOI": "10.1093/qjmed/hci136",
+    "type": "article-journal",
+    "page": "845-856",
+    "source": "Crossref",
+    "title": "Reverse cholesterol transport and cholesterol efflux in atherosclerosis",
+    "volume": "98",
+    "author": [
+      {
+        "given": "R.",
+        "family": "Ohashi"
+      },
+      {
+        "given": "H.",
+        "family": "Mu"
+      },
+      {
+        "given": "X.",
+        "family": "Wang"
+      },
+      {
+        "given": "Q.",
+        "family": "Yao"
+      },
+      {
+        "given": "C.",
+        "family": "Chen"
+      }
+    ],
+    "container-title": "QJM: An International Journal of Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2005,
+          10,
+          28
+        ]
+      ]
+    },
+    "URL": "https://doi.org/dn2fgt",
+    "PMID": "16258026",
+    "id": "idlBgtFz",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1093/qjmed/hci136"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Gene set enrichment analysis (GSEA) is an ubiquitously used tool for evaluating pathway enrichment in transcriptional data. Typical experimental design consists in comparing two conditions with several replicates using a differential gene expression test followed by preranked GSEA performed against a collection of hundreds and thousands of pathways. However, the reference implementation of this method cannot accurately estimate small P-values, which significantly limits its sensitivity due to multiple hypotheses correction procedure.</jats:p><jats:p>Here we present FGSEA (Fast Gene Set Enrichment Analysis) method that is able to estimate arbitrarily low GSEA P-values with a high accuracy in a matter of minutes or even seconds. To confirm the accuracy of the method, we also developed an exact algorithm for GSEA P-values calculation for integer gene-level statistics. Using the exact algorithm as a reference we show that FGSEA is able to routinely estimate P-values up to 10<jats:sup>−100</jats:sup> with a small and predictable estimation error. We systematically evaluate FGSEA on a collection of 605 datasets and show that FGSEA recovers much more statistically significant pathways compared to other implementations.</jats:p><jats:p>FGSEA is open source and available as an R package in Bioconductor (<jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"http://bioconductor.org/packages/fgsea/\">http://bioconductor.org/packages/fgsea/</jats:ext-link>) and on GitHub (<jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"https://github.com/ctlab/fgsea/\">https://github.com/ctlab/fgsea/</jats:ext-link>).</jats:p>",
+    "DOI": "10.1101/060012",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "Fast gene set enrichment analysis",
+    "author": [
+      {
+        "given": "Gennady",
+        "family": "Korotkevich"
+      },
+      {
+        "given": "Vladimir",
+        "family": "Sukhov"
+      },
+      {
+        "given": "Nikolay",
+        "family": "Budin"
+      },
+      {
+        "given": "Boris",
+        "family": "Shpak"
+      },
+      {
+        "given": "Maxim N.",
+        "family": "Artyomov"
+      },
+      {
+        "given": "Alexey",
+        "family": "Sergushichev"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          6,
+          20
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfpqhm",
+    "id": "Z8WXLD67",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/060012"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>There are currently &gt;1.3 million human –omics samples that are publicly available. This valuable resource remains acutely underused because discovering particular samples from this ever-growing data collection remains a significant challenge. The major impediment is that sample attributes are routinely described using varied terminologies written in unstructured natural language. We propose a natural-language-processing-based machine learning approach (NLP-ML) to infer tissue and cell-type annotations for –omics samples based only on their free-text metadata. NLP-ML works by creating numerical representations of sample descriptions and using these representations as features in a supervised learning classifier that predicts tissue/cell-type terms. Our approach significantly outperforms an advanced graph-based reasoning annotation method (MetaSRA) and a baseline exact string matching method (TAGGER). Model similarities between related tissues demonstrate that NLP-ML models capture biologically-meaningful signals in text. Additionally, these models correctly classify tissue-associated biological processes and diseases based on their text descriptions alone. NLP-ML models are nearly as accurate as models based on gene-expression profiles in predicting sample tissue annotations but have the distinct capability to classify samples irrespective of the –omics experiment type based on their text metadata. Python NLP-ML prediction code and trained tissue models are available at <jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"https://github.com/krishnanlab/txt2onto\">https://github.com/krishnanlab/txt2onto</jats:ext-link>.</jats:p>",
+    "DOI": "10.1101/2021.05.10.443525",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "Systematic tissue annotations of –omics samples by modeling unstructured metadata",
+    "author": [
+      {
+        "given": "Nathaniel T.",
+        "family": "Hawkins"
+      },
+      {
+        "given": "Marc",
+        "family": "Maldaver"
+      },
+      {
+        "given": "Anna",
+        "family": "Yannakopoulos"
+      },
+      {
+        "given": "Lindsay A.",
+        "family": "Guare"
+      },
+      {
+        "given": "Arjun",
+        "family": "Krishnan"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          5,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gj2pkc",
+    "id": "fnDaLjFy",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2021.05.10.443525"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>ABSTRACT</jats:title><jats:p>We present recount3, a resource consisting of over 750,000 publicly available human and mouse RNA sequencing (RNA-seq) samples uniformly processed by our new <jats:monospace>Monorail</jats:monospace> analysis pipeline. To facilitate access to the data, we provide the <jats:monospace>recount3</jats:monospace> and <jats:monospace>snapcount</jats:monospace> R/Bioconductor packages as well as complementary web resources. Using these tools, data can be downloaded as study-level summaries or queried for specific exon-exon junctions, genes, samples, or other features. <jats:monospace>Monorail</jats:monospace> can be used to process local and/or private data, allowing results to be directly compared to any study in recount3. Taken together, our tools help biologists maximize the utility of publicly available RNA-seq data, especially to improve their understanding of newly collected data. recount3 is available from <jats:ext-link xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"uri\" xlink:href=\"http://rna.recount.bio\">http://rna.recount.bio</jats:ext-link>.</jats:p>",
+    "DOI": "10.1101/2021.05.21.445138",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "recount3: summaries and queries for large-scale RNA-seq expression and splicing",
+    "author": [
+      {
+        "given": "Christopher",
+        "family": "Wilks"
+      },
+      {
+        "given": "Shijie C.",
+        "family": "Zheng"
+      },
+      {
+        "given": "Feng Yong",
+        "family": "Chen"
+      },
+      {
+        "given": "Rone",
+        "family": "Charles"
+      },
+      {
+        "given": "Brad",
+        "family": "Solomon"
+      },
+      {
+        "given": "Jonathan P.",
+        "family": "Ling"
+      },
+      {
+        "given": "Eddie Luidy",
+        "family": "Imada"
+      },
+      {
+        "given": "David",
+        "family": "Zhang"
+      },
+      {
+        "given": "Lance",
+        "family": "Joseph"
+      },
+      {
+        "given": "Jeffrey T.",
+        "family": "Leek"
+      },
+      {
+        "given": "Andrew E.",
+        "family": "Jaffe"
+      },
+      {
+        "given": "Abhinav",
+        "family": "Nellore"
+      },
+      {
+        "given": "Leonardo",
+        "family": "Collado-Torres"
+      },
+      {
+        "given": "Kasper D.",
+        "family": "Hansen"
+      },
+      {
+        "given": "Ben",
+        "family": "Langmead"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          5,
+          23
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gj7cmq",
+    "id": "TPVeG4GP",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2021.05.21.445138"
+  },
+  {
+    "publisher": "Cold Spring Harbor Laboratory",
+    "abstract": "<jats:title>Abstract</jats:title><jats:p>Understanding genetic factors of complex traits across ancestry groups holds a key to improve the overall health care quality for diverse populations in the United States. In recent years, multiple electronic health record-linked (EHR-linked) biobanks have recruited participants of diverse ancestry backgrounds; these biobanks make it possible to obtain phenome-wide association study (PheWAS) summary statistics on a genome-wide scale for different ancestry groups. Moreover, advancement in bioinformatics methods provide novel means to accelerate the translation of basic discoveries to clinical utility by integrating GWAS summary statistics and expression quantitative trait locus (eQTL) data to identify complex trait-related genes, such as transcriptome-wide association study (TWAS) and colocalization analyses. Here, we combined the advantages of multi-ancestry biobanks and data integrative approaches to investigate the multi-ancestry, gene-disease connection landscape. We first performed a phenome-wide TWAS on Electronic Medical Records and Genomics (eMERGE) III network participants of European ancestry (N = 68,813) and participants of African ancestry (N = 12,658) populations, separately. For each ancestry group, the phenome-wide TWAS tested gene-disease associations between 22,535 genes and 309 curated disease phenotypes in 49 primary human tissues, as well as cross-tissue associations. Next, we identified gene-disease associations that were shared across the two ancestry groups by combining the ancestry-specific results via meta-analyses. We further applied a Bayesian colocalization method, fastENLOC, to prioritize likely functional gene-disease associations with supportive colocalized eQTL and GWAS signals. 
We replicated the phenome-wide gene-disease analysis in the analogous Penn Medicine BioBank (PMBB) cohorts and sought additional validations in the PhenomeXcan UK Biobank (UKBB) database, PheWAS catalog, and systematic literature review. Phenome-wide TWAS identified many proof-of-concept gene-disease associations, e.g. <jats:italic>FTO</jats:italic>-obesity association (p = 7.29e-15), and numerous novel disease-associated genes, e.g. association between <jats:italic>GATA6-AS1</jats:italic> with pulmonary heart disease (p = 4.60e-10). In short, the multi-ancestry, gene-disease connection landscape provides rich resources for future multi-ancestry complex disease research. We also highlight the importance of expanding the size of non-European ancestry datasets and the potential of exploring ancestry-specific genetic analyses as these will be critical to improve our understanding of the genetic architecture of complex disease.</jats:p>",
+    "DOI": "10.1101/2021.10.21.21265225",
+    "type": "manuscript",
+    "source": "Crossref",
+    "title": "Multi-ancestry gene-trait connection landscape using electronic health record (EHR) linked biobank data",
+    "author": [
+      {
+        "given": "Binglan",
+        "family": "Li"
+      },
+      {
+        "given": "Yogasudha",
+        "family": "Veturi"
+      },
+      {
+        "given": "Anastasia",
+        "family": "Lucas"
+      },
+      {
+        "given": "Yuki",
+        "family": "Bradford"
+      },
+      {
+        "given": "Shefali S.",
+        "family": "Verma"
+      },
+      {
+        "given": "Anurag",
+        "family": "Verma"
+      },
+      {
+        "given": "Joseph",
+        "family": "Park"
+      },
+      {
+        "given": "Wei-Qi",
+        "family": "Wei"
+      },
+      {
+        "given": "Qiping",
+        "family": "Feng"
+      },
+      {
+        "given": "Bahram",
+        "family": "Namjou"
+      },
+      {
+        "given": "Krzysztof",
+        "family": "Kiryluk"
+      },
+      {
+        "given": "Iftikhar",
+        "family": "Kullo"
+      },
+      {
+        "given": "Yuan",
+        "family": "Luo"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      },
+      {
+        "given": "Marylyn D.",
+        "family": "Ritchie"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          10,
+          26
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gnbdnb",
+    "id": "gZAOkumx",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1101/2021.10.21.21265225"
+  },
+  {
+    "publisher": "Institute of Electrical and Electronics Engineers (IEEE)",
+    "issue": "6",
+    "DOI": "10.1109/tpami.2005.113",
+    "type": "article-journal",
+    "page": "835-850",
+    "source": "Crossref",
+    "title": "Combining multiple clusterings using evidence accumulation",
+    "volume": "27",
+    "author": [
+      {
+        "given": "Ana L.N.",
+        "family": "Fred"
+      },
+      {
+        "given": "Anil K.",
+        "family": "Jain"
+      }
+    ],
+    "container-title": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+    "issued": {
+      "date-parts": [
+        [
+          2005,
+          6
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bsknv6",
+    "container-title-short": "IEEE Trans. Pattern Anal. Mach. Intell.",
+    "PMID": "15943417",
+    "id": "cuROQDFa",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1109/tpami.2005.113"
+  },
+  {
+    "publisher": "Institute of Electrical and Electronics Engineers (IEEE)",
+    "issue": "12",
+    "DOI": "10.1109/tpami.2011.84",
+    "type": "article-journal",
+    "page": "2396-2409",
+    "source": "Crossref",
+    "title": "A Link-Based Approach to the Cluster Ensemble Problem",
+    "volume": "33",
+    "author": [
+      {
+        "given": "Natthakan",
+        "family": "Iam-On"
+      },
+      {
+        "given": "Tossapon",
+        "family": "Boongoen"
+      },
+      {
+        "given": "Simon",
+        "family": "Garrett"
+      },
+      {
+        "given": "Chris",
+        "family": "Price"
+      }
+    ],
+    "container-title": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cqgkh3",
+    "container-title-short": "IEEE Trans. Pattern Anal. Mach. Intell.",
+    "PMID": "21576752",
+    "id": "rcTMvL18",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1109/tpami.2011.84"
+  },
+  {
+    "publisher": "Wiley",
+    "issue": "3",
+    "DOI": "10.1111/j.1076-7460.2007.06696.x",
+    "type": "article-journal",
+    "page": "143-149",
+    "source": "Crossref",
+    "title": "Cardiovascular Risk Factors for Alzheimer's Disease",
+    "volume": "16",
+    "author": [
+      {
+        "given": "Clive",
+        "family": "Rosendorff"
+      },
+      {
+        "given": "Michal S.",
+        "family": "Beeri"
+      },
+      {
+        "given": "Jeremy M.",
+        "family": "Silverman"
+      }
+    ],
+    "container-title": "The American Journal of Geriatric Cardiology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2007,
+          3
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bpfw5d",
+    "container-title-short": "Amer J Geriatric Cardiol",
+    "PMID": "17483665",
+    "id": "9BGyO071",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1111/j.1076-7460.2007.06696.x"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "37",
+    "abstract": "<jats:p>PhenomeXcan is a gene-based resource of gene-trait associations, providing biological contexts for translational research.</jats:p>",
+    "DOI": "10.1126/sciadv.aba2083",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "PhenomeXcan: Mapping the genome to the phenome through the transcriptome",
+    "volume": "6",
+    "author": [
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Padma S.",
+        "family": "Rajagopal"
+      },
+      {
+        "given": "Alvaro",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Owen",
+        "family": "Melia"
+      },
+      {
+        "given": "Lisa",
+        "family": "Bastarache"
+      },
+      {
+        "given": "YoSon",
+        "family": "Park"
+      },
+      {
+        "given": "GTEx",
+        "family": "Consortium"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      },
+      {
+        "given": "Hae K.",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Science Advances",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghbvbf",
+    "container-title-short": "Sci. Adv.",
+    "PMID": "32917697",
+    "id": "lY5ln3dB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/sciadv.aba2083"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "6259",
+    "abstract": "<jats:title>An Aluring new autoantibody target</jats:title>\n          <jats:p>\n            Autoimmunity is the immune system's ultimate act of betrayal. Cells designed to protect against invading microbes suddenly target the host instead. In the autoimmune disease systemic lupus erythematosus, antibodies target DNA and host proteins, including the RNA binding protein Ro60. Hung\n            <jats:italic>et al.</jats:italic>\n            discovered that Ro60 bound to endogenous Alu retroelements. They detected antibody-Ro60-Alu RNA immune complexes in the blood of individuals with lupus and an enrichment of Alu transcripts. Ro60 bound to Alu probably primes RNA-binding innate immune receptors within B cells, leading these cells to make antibodies that target Ro60-Alu RNA and drive disease-causing inflammation.\n          </jats:p>\n          <jats:p>\n            <jats:italic>Science</jats:italic>\n            , this issue p.\n            <jats:related-article xmlns:xlink=\"http://www.w3.org/1999/xlink\" ext-link-type=\"doi\" issue=\"6259\" page=\"455\" related-article-type=\"in-this-issue\" vol=\"350\" xlink:href=\"10.1126/science.aac7442\">455</jats:related-article>\n          </jats:p>",
+    "DOI": "10.1126/science.aac7442",
+    "type": "article-journal",
+    "page": "455-459",
+    "source": "Crossref",
+    "title": "The Ro60 autoantigen binds endogenous retroelements and regulates inflammatory gene expression",
+    "volume": "350",
+    "author": [
+      {
+        "given": "T.",
+        "family": "Hung"
+      },
+      {
+        "given": "G. A.",
+        "family": "Pratt"
+      },
+      {
+        "given": "B.",
+        "family": "Sundararaman"
+      },
+      {
+        "given": "M. J.",
+        "family": "Townsend"
+      },
+      {
+        "given": "C.",
+        "family": "Chaivorapol"
+      },
+      {
+        "given": "T.",
+        "family": "Bhangale"
+      },
+      {
+        "given": "R. R.",
+        "family": "Graham"
+      },
+      {
+        "given": "W.",
+        "family": "Ortmann"
+      },
+      {
+        "given": "L. A.",
+        "family": "Criswell"
+      },
+      {
+        "given": "G. W.",
+        "family": "Yeo"
+      },
+      {
+        "given": "T. W.",
+        "family": "Behrens"
+      }
+    ],
+    "container-title": "Science",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          10,
+          23
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f7vs67",
+    "container-title-short": "Science",
+    "PMCID": "PMC4691329",
+    "PMID": "26382853",
+    "id": "EnoqU4ga",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/science.aac7442"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "6509",
+    "abstract": "<jats:p>The Genotype-Tissue Expression (GTEx) project dissects how genetic variation affects gene expression and splicing.</jats:p>",
+    "DOI": "10.1126/science.aaz1776",
+    "type": "article-journal",
+    "page": "1318-1330",
+    "source": "Crossref",
+    "title": "The GTEx Consortium atlas of genetic regulatory effects across human tissues",
+    "volume": "369",
+    "author": [
+      {},
+      {
+        "given": "François",
+        "family": "Aguet"
+      },
+      {
+        "given": "Shankara",
+        "family": "Anand"
+      },
+      {
+        "given": "Kristin G.",
+        "family": "Ardlie"
+      },
+      {
+        "given": "Stacey",
+        "family": "Gabriel"
+      },
+      {
+        "given": "Gad A.",
+        "family": "Getz"
+      },
+      {
+        "given": "Aaron",
+        "family": "Graubert"
+      },
+      {
+        "given": "Kane",
+        "family": "Hadley"
+      },
+      {
+        "given": "Robert E.",
+        "family": "Handsaker"
+      },
+      {
+        "given": "Katherine H.",
+        "family": "Huang"
+      },
+      {
+        "given": "Seva",
+        "family": "Kashin"
+      },
+      {
+        "given": "Xiao",
+        "family": "Li"
+      },
+      {
+        "given": "Daniel G.",
+        "family": "MacArthur"
+      },
+      {
+        "given": "Samuel R.",
+        "family": "Meier"
+      },
+      {
+        "given": "Jared L.",
+        "family": "Nedzel"
+      },
+      {
+        "given": "Duyen T.",
+        "family": "Nguyen"
+      },
+      {
+        "given": "Ayellet V.",
+        "family": "Segrè"
+      },
+      {
+        "given": "Ellen",
+        "family": "Todres"
+      },
+      {
+        "given": "Brunilda",
+        "family": "Balliu"
+      },
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Alexis",
+        "family": "Battle"
+      },
+      {
+        "given": "Rodrigo",
+        "family": "Bonazzola"
+      },
+      {
+        "given": "Andrew",
+        "family": "Brown"
+      },
+      {
+        "given": "Christopher D.",
+        "family": "Brown"
+      },
+      {
+        "given": "Stephane E.",
+        "family": "Castel"
+      },
+      {
+        "given": "Donald F.",
+        "family": "Conrad"
+      },
+      {
+        "given": "Daniel J.",
+        "family": "Cotter"
+      },
+      {
+        "given": "Nancy",
+        "family": "Cox"
+      },
+      {
+        "given": "Sayantan",
+        "family": "Das"
+      },
+      {
+        "given": "Olivia M.",
+        "family": "de Goede"
+      },
+      {
+        "given": "Emmanouil T.",
+        "family": "Dermitzakis"
+      },
+      {
+        "given": "Jonah",
+        "family": "Einson"
+      },
+      {
+        "given": "Barbara E.",
+        "family": "Engelhardt"
+      },
+      {
+        "given": "Eleazar",
+        "family": "Eskin"
+      },
+      {
+        "given": "Tiffany Y.",
+        "family": "Eulalio"
+      },
+      {
+        "given": "Nicole M.",
+        "family": "Ferraro"
+      },
+      {
+        "given": "Elise D.",
+        "family": "Flynn"
+      },
+      {
+        "given": "Laure",
+        "family": "Fresard"
+      },
+      {
+        "given": "Eric R.",
+        "family": "Gamazon"
+      },
+      {
+        "given": "Diego",
+        "family": "Garrido-Martín"
+      },
+      {
+        "given": "Nicole R.",
+        "family": "Gay"
+      },
+      {
+        "given": "Michael J.",
+        "family": "Gloudemans"
+      },
+      {
+        "given": "Roderic",
+        "family": "Guigó"
+      },
+      {
+        "given": "Andrew R.",
+        "family": "Hame"
+      },
+      {
+        "given": "Yuan",
+        "family": "He"
+      },
+      {
+        "given": "Paul J.",
+        "family": "Hoffman"
+      },
+      {
+        "given": "Farhad",
+        "family": "Hormozdiari"
+      },
+      {
+        "given": "Lei",
+        "family": "Hou"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      },
+      {
+        "given": "Brian",
+        "family": "Jo"
+      },
+      {
+        "given": "Silva",
+        "family": "Kasela"
+      },
+      {
+        "given": "Manolis",
+        "family": "Kellis"
+      },
+      {
+        "given": "Sarah",
+        "family": "Kim-Hellmuth"
+      },
+      {
+        "given": "Alan",
+        "family": "Kwong"
+      },
+      {
+        "given": "Tuuli",
+        "family": "Lappalainen"
+      },
+      {
+        "given": "Xin",
+        "family": "Li"
+      },
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Serghei",
+        "family": "Mangul"
+      },
+      {
+        "given": "Pejman",
+        "family": "Mohammadi"
+      },
+      {
+        "given": "Stephen B.",
+        "family": "Montgomery"
+      },
+      {
+        "given": "Manuel",
+        "family": "Muñoz-Aguirre"
+      },
+      {
+        "given": "Daniel C.",
+        "family": "Nachun"
+      },
+      {
+        "given": "Andrew B.",
+        "family": "Nobel"
+      },
+      {
+        "given": "Meritxell",
+        "family": "Oliva"
+      },
+      {
+        "given": "YoSon",
+        "family": "Park"
+      },
+      {
+        "given": "Yongjin",
+        "family": "Park"
+      },
+      {
+        "given": "Princy",
+        "family": "Parsana"
+      },
+      {
+        "given": "Abhiram S.",
+        "family": "Rao"
+      },
+      {
+        "given": "Ferran",
+        "family": "Reverter"
+      },
+      {
+        "given": "John M.",
+        "family": "Rouhana"
+      },
+      {
+        "given": "Chiara",
+        "family": "Sabatti"
+      },
+      {
+        "given": "Ashis",
+        "family": "Saha"
+      },
+      {
+        "given": "Matthew",
+        "family": "Stephens"
+      },
+      {
+        "given": "Barbara E.",
+        "family": "Stranger"
+      },
+      {
+        "given": "Benjamin J.",
+        "family": "Strober"
+      },
+      {
+        "given": "Nicole A.",
+        "family": "Teran"
+      },
+      {
+        "given": "Ana",
+        "family": "Viñuela"
+      },
+      {
+        "given": "Gao",
+        "family": "Wang"
+      },
+      {
+        "given": "Xiaoquan",
+        "family": "Wen"
+      },
+      {
+        "given": "Fred",
+        "family": "Wright"
+      },
+      {
+        "given": "Valentin",
+        "family": "Wucher"
+      },
+      {
+        "given": "Yuxin",
+        "family": "Zou"
+      },
+      {
+        "given": "Pedro G.",
+        "family": "Ferreira"
+      },
+      {
+        "given": "Gen",
+        "family": "Li"
+      },
+      {
+        "given": "Marta",
+        "family": "Melé"
+      },
+      {
+        "given": "Esti",
+        "family": "Yeger-Lotem"
+      },
+      {
+        "given": "Mary E.",
+        "family": "Barcus"
+      },
+      {
+        "given": "Debra",
+        "family": "Bradbury"
+      },
+      {
+        "given": "Tanya",
+        "family": "Krubit"
+      },
+      {
+        "given": "Jeffrey A.",
+        "family": "McLean"
+      },
+      {
+        "given": "Liqun",
+        "family": "Qi"
+      },
+      {
+        "given": "Karna",
+        "family": "Robinson"
+      },
+      {
+        "given": "Nancy V.",
+        "family": "Roche"
+      },
+      {
+        "given": "Anna M.",
+        "family": "Smith"
+      },
+      {
+        "given": "Leslie",
+        "family": "Sobin"
+      },
+      {
+        "given": "David E.",
+        "family": "Tabor"
+      },
+      {
+        "given": "Anita",
+        "family": "Undale"
+      },
+      {
+        "given": "Jason",
+        "family": "Bridge"
+      },
+      {
+        "given": "Lori E.",
+        "family": "Brigham"
+      },
+      {
+        "given": "Barbara A.",
+        "family": "Foster"
+      },
+      {
+        "given": "Bryan M.",
+        "family": "Gillard"
+      },
+      {
+        "given": "Richard",
+        "family": "Hasz"
+      },
+      {
+        "given": "Marcus",
+        "family": "Hunter"
+      },
+      {
+        "given": "Christopher",
+        "family": "Johns"
+      },
+      {
+        "given": "Mark",
+        "family": "Johnson"
+      },
+      {
+        "given": "Ellen",
+        "family": "Karasik"
+      },
+      {
+        "given": "Gene",
+        "family": "Kopen"
+      },
+      {
+        "given": "William F.",
+        "family": "Leinweber"
+      },
+      {
+        "given": "Alisa",
+        "family": "McDonald"
+      },
+      {
+        "given": "Michael T.",
+        "family": "Moser"
+      },
+      {
+        "given": "Kevin",
+        "family": "Myer"
+      },
+      {
+        "given": "Kimberley D.",
+        "family": "Ramsey"
+      },
+      {
+        "given": "Brian",
+        "family": "Roe"
+      },
+      {
+        "given": "Saboor",
+        "family": "Shad"
+      },
+      {
+        "given": "Jeffrey A.",
+        "family": "Thomas"
+      },
+      {
+        "given": "Gary",
+        "family": "Walters"
+      },
+      {
+        "given": "Michael",
+        "family": "Washington"
+      },
+      {
+        "given": "Joseph",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Scott D.",
+        "family": "Jewell"
+      },
+      {
+        "given": "Daniel C.",
+        "family": "Rohrer"
+      },
+      {
+        "given": "Dana R.",
+        "family": "Valley"
+      },
+      {
+        "given": "David A.",
+        "family": "Davis"
+      },
+      {
+        "given": "Deborah C.",
+        "family": "Mash"
+      },
+      {
+        "given": "Philip A.",
+        "family": "Branton"
+      },
+      {
+        "given": "Laura K.",
+        "family": "Barker"
+      },
+      {
+        "given": "Heather M.",
+        "family": "Gardiner"
+      },
+      {
+        "given": "Maghboeba",
+        "family": "Mosavel"
+      },
+      {
+        "given": "Laura A.",
+        "family": "Siminoff"
+      },
+      {
+        "given": "Paul",
+        "family": "Flicek"
+      },
+      {
+        "given": "Maximilian",
+        "family": "Haeussler"
+      },
+      {
+        "given": "Thomas",
+        "family": "Juettemann"
+      },
+      {
+        "given": "W. James",
+        "family": "Kent"
+      },
+      {
+        "given": "Christopher M.",
+        "family": "Lee"
+      },
+      {
+        "given": "Conner C.",
+        "family": "Powell"
+      },
+      {
+        "given": "Kate R.",
+        "family": "Rosenbloom"
+      },
+      {
+        "given": "Magali",
+        "family": "Ruffier"
+      },
+      {
+        "given": "Dan",
+        "family": "Sheppard"
+      },
+      {
+        "given": "Kieron",
+        "family": "Taylor"
+      },
+      {
+        "given": "Stephen J.",
+        "family": "Trevanion"
+      },
+      {
+        "given": "Daniel R.",
+        "family": "Zerbino"
+      },
+      {
+        "given": "Nathan S.",
+        "family": "Abell"
+      },
+      {
+        "given": "Joshua",
+        "family": "Akey"
+      },
+      {
+        "given": "Lin",
+        "family": "Chen"
+      },
+      {
+        "given": "Kathryn",
+        "family": "Demanelis"
+      },
+      {
+        "given": "Jennifer A.",
+        "family": "Doherty"
+      },
+      {
+        "given": "Andrew P.",
+        "family": "Feinberg"
+      },
+      {
+        "given": "Kasper D.",
+        "family": "Hansen"
+      },
+      {
+        "given": "Peter F.",
+        "family": "Hickey"
+      },
+      {
+        "given": "Farzana",
+        "family": "Jasmine"
+      },
+      {
+        "given": "Lihua",
+        "family": "Jiang"
+      },
+      {
+        "given": "Rajinder",
+        "family": "Kaul"
+      },
+      {
+        "given": "Muhammad G.",
+        "family": "Kibriya"
+      },
+      {
+        "given": "Jin Billy",
+        "family": "Li"
+      },
+      {
+        "given": "Qin",
+        "family": "Li"
+      },
+      {
+        "given": "Shin",
+        "family": "Lin"
+      },
+      {
+        "given": "Sandra E.",
+        "family": "Linder"
+      },
+      {
+        "given": "Brandon L.",
+        "family": "Pierce"
+      },
+      {
+        "given": "Lindsay F.",
+        "family": "Rizzardi"
+      },
+      {
+        "given": "Andrew D.",
+        "family": "Skol"
+      },
+      {
+        "given": "Kevin S.",
+        "family": "Smith"
+      },
+      {
+        "given": "Michael",
+        "family": "Snyder"
+      },
+      {
+        "given": "John",
+        "family": "Stamatoyannopoulos"
+      },
+      {
+        "given": "Hua",
+        "family": "Tang"
+      },
+      {
+        "given": "Meng",
+        "family": "Wang"
+      },
+      {
+        "given": "Latarsha J.",
+        "family": "Carithers"
+      },
+      {
+        "given": "Ping",
+        "family": "Guan"
+      },
+      {
+        "given": "Susan E.",
+        "family": "Koester"
+      },
+      {
+        "given": "A. Roger",
+        "family": "Little"
+      },
+      {
+        "given": "Helen M.",
+        "family": "Moore"
+      },
+      {
+        "given": "Concepcion R.",
+        "family": "Nierras"
+      },
+      {
+        "given": "Abhi K.",
+        "family": "Rao"
+      },
+      {
+        "given": "Jimmie B.",
+        "family": "Vaught"
+      },
+      {
+        "given": "Simona",
+        "family": "Volpi"
+      }
+    ],
+    "container-title": "Science",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          9,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghbnhr",
+    "container-title-short": "Science",
+    "PMCID": "PMC7737656",
+    "PMID": "32913098",
+    "id": "9Pr9idng",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/science.aaz1776"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "432",
+    "abstract": "<jats:p>Properly applied, clustering methods reveal meaning in high-throughput biological data.</jats:p>",
+    "DOI": "10.1126/scisignal.aad1932",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Avoiding common pitfalls when clustering biological data",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Tom",
+        "family": "Ronan"
+      },
+      {
+        "given": "Zhijie",
+        "family": "Qi"
+      },
+      {
+        "given": "Kristen M.",
+        "family": "Naegle"
+      }
+    ],
+    "container-title": "Science Signaling",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          6,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gcvjr6",
+    "container-title-short": "Sci. Signal.",
+    "PMID": "27303057",
+    "id": "14dCeRkua",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/scisignal.aad1932"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "96",
+    "abstract": "<jats:p>A systematic computational method predicts new uses for existing drugs by integrating public gene expression signatures of drugs and diseases.</jats:p>",
+    "DOI": "10.1126/scitranslmed.3001318",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Discovery and Preclinical Validation of Drug Indications Using Compendia of Public Gene Expression Data",
+    "volume": "3",
+    "author": [
+      {
+        "given": "Marina",
+        "family": "Sirota"
+      },
+      {
+        "given": "Joel T.",
+        "family": "Dudley"
+      },
+      {
+        "given": "Jeewon",
+        "family": "Kim"
+      },
+      {
+        "given": "Annie P.",
+        "family": "Chiang"
+      },
+      {
+        "given": "Alex A.",
+        "family": "Morgan"
+      },
+      {
+        "given": "Alejandro",
+        "family": "Sweet-Cordero"
+      },
+      {
+        "given": "Julien",
+        "family": "Sage"
+      },
+      {
+        "given": "Atul J.",
+        "family": "Butte"
+      }
+    ],
+    "container-title": "Science Translational Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          8,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/c3fwxv",
+    "container-title-short": "Sci. Transl. Med.",
+    "PMCID": "PMC3502016",
+    "PMID": "21849665",
+    "id": "mZjkE1xU",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/scitranslmed.3001318"
+  },
+  {
+    "publisher": "American Association for the Advancement of Science (AAAS)",
+    "issue": "96",
+    "abstract": "<jats:p>Computationally predicted repositioning of an anticonvulsant for inflammatory bowel disease is confirmed experimentally.</jats:p>",
+    "DOI": "10.1126/scitranslmed.3002648",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Computational Repositioning of the Anticonvulsant Topiramate for Inflammatory Bowel Disease",
+    "volume": "3",
+    "author": [
+      {
+        "given": "Joel T.",
+        "family": "Dudley"
+      },
+      {
+        "given": "Marina",
+        "family": "Sirota"
+      },
+      {
+        "given": "Mohan",
+        "family": "Shenoy"
+      },
+      {
+        "given": "Reetesh K.",
+        "family": "Pai"
+      },
+      {
+        "given": "Silke",
+        "family": "Roedder"
+      },
+      {
+        "given": "Annie P.",
+        "family": "Chiang"
+      },
+      {
+        "given": "Alex A.",
+        "family": "Morgan"
+      },
+      {
+        "given": "Minnie M.",
+        "family": "Sarwal"
+      },
+      {
+        "given": "Pankaj Jay",
+        "family": "Pasricha"
+      },
+      {
+        "given": "Atul J.",
+        "family": "Butte"
+      }
+    ],
+    "container-title": "Science Translational Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          8,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/bmh5ts",
+    "container-title-short": "Sci. Transl. Med.",
+    "PMCID": "PMC3479650",
+    "PMID": "21849664",
+    "id": "1ClBKizD7",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1126/scitranslmed.3002648"
+  },
+  {
+    "publisher": "S. Karger AG",
+    "issue": "6",
+    "abstract": "<jats:p>Background/Aims: Over 99% of mouse and human ovarian follicles will undergo specialized cell death including atresia and apoptosis. Reduction of apoptosis may help reduce infertility and maintain the reproductive ability in women. Methods: 3-day B6D2F1 mice were used to culture small follicle and ovary tissue with niacin and 18-day mice were intraperitoneal injected with niacin to determine its effect on follicle development. Then establish 8-weeks POF animal model with cytoxan (CTX) or radiation. Treatment group was given 0.1 mL of 100 mM niacin by an intraperitoneal injection twice before ovulation. The ovaries were collected and the follicles were counted and categorized, and ovarian histologic sections were stained for TUNEL. Ovarian function was then evaluated by monitoring ovulation. Microarray analyses, Western blot, immunofluorescence and real-time quantitative PCR were used to assess the mechanism of ovarian injury and repair. Results: We found that niacin promotes follicle growth in the immature oocyte and it increased the levels of a germ-line cell marker DDX4, and a cell proliferation marker PCNA in the ovary. Addition of niacin to the cell culture reduced oocyte apoptosis in vitro. Administration of niacin to treat premature ovarian failure (POF) in mouse models showed inhibition of follicular apoptosis under harmful conditions, such as radiation and chemotherapy damage, by markedly reducing cumulus cell apoptosis. Additionally, the number of developing follicles increased after administration of niacin. Conclusion: Niacin may have an important function in treating POF by reducing apoptosis in clinical applications.</jats:p>",
+    "DOI": "10.1159/000495051",
+    "type": "article-journal",
+    "page": "2060-2070",
+    "source": "Crossref",
+    "title": "Niacin Inhibits Apoptosis and Rescues Premature Ovarian Failure",
+    "volume": "50",
+    "author": [
+      {
+        "given": "Shufang",
+        "family": "Wang"
+      },
+      {
+        "given": "Min",
+        "family": "Sun"
+      },
+      {
+        "given": "Ling",
+        "family": "Yu"
+      },
+      {
+        "given": "Yixuan",
+        "family": "Wang"
+      },
+      {
+        "given": "Yuanqing",
+        "family": "Yao"
+      },
+      {
+        "given": "Deqing",
+        "family": "Wang"
+      }
+    ],
+    "container-title": "Cellular Physiology and Biochemistry",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2018
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gfqvcq",
+    "container-title-short": "Cell Physiol Biochem",
+    "PMID": "30415247",
+    "id": "kLRErKXz",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1159/000495051"
+  },
+  {
+    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
+    "issue": "2",
+    "abstract": "<jats:p>Atherosclerosis has been characterized as a chronic inflammatory response to cholesterol deposition in arteries, but the mechanisms linking cholesterol accumulation in macrophage foam cells to inflammation are poorly understood. Macrophage cholesterol efflux occurs at all stages of atherosclerosis and protects cells from free cholesterol and oxysterol-induced toxicity. The ATP-binding cassette transporters ABCA1 and ABCG1 are responsible for the major part of macrophage cholesterol efflux to serum or HDL in macrophage foam cells, but other less efficient pathways such as passive efflux are also involved. Recent studies have shown that the sterol efflux activities of ABCA1 and ABCG1 modulate macrophage expression of inflammatory cytokines and chemokines as well as lymphocyte proliferative responses. In macrophages, transporter deficiency causes increased signaling via various Toll-like receptors including TLR4. These studies have shown that the traditional roles of HDL and ABC transporters in cholesterol efflux and reverse cholesterol transport are mechanistically linked to antiinflammatory and immunosuppressive functions of HDL. The underlying mechanisms may involve modulation of sterol levels and lipid organization in cell membranes.</jats:p>",
+    "DOI": "10.1161/atvbaha.108.179283",
+    "type": "article-journal",
+    "page": "139-143",
+    "source": "Crossref",
+    "title": "Role of HDL, ABCA1, and ABCG1 Transporters in Cholesterol Efflux and Immune Responses",
+    "volume": "30",
+    "author": [
+      {
+        "given": "Laurent",
+        "family": "Yvan-Charvet"
+      },
+      {
+        "given": "Nan",
+        "family": "Wang"
+      },
+      {
+        "given": "Alan R.",
+        "family": "Tall"
+      }
+    ],
+    "container-title": "Arteriosclerosis, Thrombosis, and Vascular Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2010,
+          2
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ds23w6",
+    "container-title-short": "ATVB",
+    "PMCID": "PMC2812788",
+    "PMID": "19797709",
+    "id": "1DblG8swn",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1161/atvbaha.108.179283"
+  },
+  {
+    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
+    "issue": "1",
+    "abstract": "<jats:p>In a somewhat narrow diagnostic lens, Alzheimer disease (AD) has been considered a brain-specific disease characterized by the presence of Aβ (β-amyloid) plaques and tau neural fibrillary tangles and neural inflammation; these pathologies lead to neuronal death and consequently clinical symptoms, such as memory loss, confusion, and impaired cognitive function. However, for decades, researchers have noticed a link between various cardiovascular abnormalities and AD—such as heart failure, coronary artery disease, atrial fibrillation, and vasculopathy. A considerable volume of work has pointed at this head to heart connection, focusing mainly on associations between cerebral hypoperfusion and neuronal degradation. However, new evidence of a possible systemic or metastatic profile to AD calls for further analysis of this connection. Aβ aggregations—biochemically and structurally akin to those found in the typical AD pathology—are now known to be present in the hearts of individuals with idiopathic dilated cardiomyopathy, as well as the hearts of patients with AD. These findings suggest a potential systemic profile of proteinopathies and a new hypothesis for the link between peripheral and central symptoms of heart failure and AD. Herein, we provide an overview of the cardiovascular links to Alzheimer disease.</jats:p>",
+    "DOI": "10.1161/circresaha.118.313563",
+    "type": "article-journal",
+    "page": "142-149",
+    "source": "Crossref",
+    "title": "Getting to the Heart of Alzheimer Disease",
+    "volume": "124",
+    "author": [
+      {
+        "given": "Joshua M.",
+        "family": "Tublin"
+      },
+      {
+        "given": "Jeremy M.",
+        "family": "Adelstein"
+      },
+      {
+        "given": "Federica",
+        "family": "del Monte"
+      },
+      {
+        "given": "Colin K.",
+        "family": "Combs"
+      },
+      {
+        "given": "Loren E.",
+        "family": "Wold"
+      }
+    ],
+    "container-title": "Circulation Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          1,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjzjgq",
+    "container-title-short": "Circ Res",
+    "PMCID": "PMC6319653",
+    "PMID": "30605407",
+    "id": "13t4TuFeJ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1161/circresaha.118.313563"
+  },
+  {
+    "publisher": "Ovid Technologies (Wolters Kluwer Health)",
+    "issue": "2",
+    "abstract": "<jats:sec>\n            <jats:title>Background—</jats:title>\n            <jats:p>Depression, anxiety, and psychotic disorders have been associated with an increased risk of coronary heart disease (CHD). It is unclear whether this association between mental health and CHD is present across a wider range of mental disorders.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Methods and Results—</jats:title>\n            <jats:p>Participants were 1 107 524 Swedish men conscripted at a mean age of 18.3 years. Mental disorders were assessed by psychiatric interview on conscription, and data on hospital admissions for mental disorder and CHD were obtained from national registers during 22.6 years of follow-up. An increased risk of incident CHD was evident across a range of mental disorders whether diagnosed at conscription or on later hospital admission. Age-adjusted hazard ratios (95% confidence intervals) according to diagnoses at conscription ranged from 1.30 (1.05–1.62) (depressive disorders) to 1.90 (1.58–2.28) (alcohol-related disorders). The equivalent figures according to diagnoses during hospital admission ranged from 1.49 (1.24–1.80) (schizophrenia) to 2.82 (2.53–3.13) (other substance use disorders). Associations were little changed by adjustment for parental socioeconomic status, or body mass index, diabetes mellitus, and blood pressure measured at conscription, but they were partially attenuated by the adjustment for smoking, alcohol intake, and intelligence measured at conscription, and for education and own socioeconomic position.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Conclusions—</jats:title>\n            <jats:p>Increased risk of incident CHD is present across a range of mental disorders and is observable when the disorders are diagnosed at a young age.</jats:p>\n          </jats:sec>",
+    "DOI": "10.1161/circulationaha.113.002065",
+    "type": "article-journal",
+    "page": "186-193",
+    "source": "Crossref",
+    "title": "Mental Disorders Across the Adult Life Course and Future Coronary Heart Disease",
+    "volume": "129",
+    "author": [
+      {
+        "given": "Catharine R.",
+        "family": "Gale"
+      },
+      {
+        "given": "G. David",
+        "family": "Batty"
+      },
+      {
+        "given": "David P. J.",
+        "family": "Osborn"
+      },
+      {
+        "given": "Per",
+        "family": "Tynelius"
+      },
+      {
+        "given": "Finn",
+        "family": "Rasmussen"
+      }
+    ],
+    "container-title": "Circulation",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          1,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/qm4",
+    "container-title-short": "Circulation",
+    "PMCID": "PMC4107269",
+    "PMID": "24190959",
+    "id": "j2Sl4DAE",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1161/circulationaha.113.002065"
+  },
+  {
+    "publisher": "American Society for Clinical Investigation",
+    "issue": "3",
+    "DOI": "10.1172/jci41651",
+    "type": "article-journal",
+    "page": "1163-1173",
+    "source": "Crossref",
+    "title": "Nicotinic acid inhibits progression of atherosclerosis in mice through its receptor GPR109A expressed by immune cells",
+    "volume": "121",
+    "author": [
+      {
+        "given": "Martina",
+        "family": "Lukasova"
+      },
+      {
+        "given": "Camille",
+        "family": "Malaval"
+      },
+      {
+        "given": "Andreas",
+        "family": "Gille"
+      },
+      {
+        "given": "Jukka",
+        "family": "Kero"
+      },
+      {
+        "given": "Stefan",
+        "family": "Offermanns"
+      }
+    ],
+    "container-title": "Journal of Clinical Investigation",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          3,
+          1
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cqftcq",
+    "container-title-short": "J. Clin. Invest.",
+    "PMCID": "PMC3048854",
+    "PMID": "21317532",
+    "id": "1Bz0jRHYo",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1172/jci41651"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title>\n          <jats:sec>\n            <jats:title>Background</jats:title>\n            <jats:p>Accurate evaluation of the quality of genomic or proteomic data and computational methods is vital to our ability to use them for formulating novel biological hypotheses and directing further experiments. There is currently no standard approach to evaluation in functional genomics. Our analysis of existing approaches shows that they are inconsistent and contain substantial functional biases that render the resulting evaluations misleading both quantitatively and qualitatively. These problems make it essentially impossible to compare computational methods or large-scale experimental datasets and also result in conclusions that generalize poorly in most biological applications.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Results</jats:title>\n            <jats:p>We reveal issues with current evaluation methods here and suggest new approaches to evaluation that facilitate accurate and representative characterization of genomic methods and data. Specifically, we describe a functional genomics gold standard based on curation by expert biologists and demonstrate its use as an effective means of evaluation of genomic approaches. Our evaluation framework and gold standard are freely available to the community through our website.</jats:p>\n          </jats:sec>\n          <jats:sec>\n            <jats:title>Conclusion</jats:title>\n            <jats:p>Proper methods for evaluating genomic data and computational approaches will determine how much we, as a community, are able to learn from the wealth of available data. We propose one possible solution to this problem here but emphasize that this topic warrants broader community discussion.</jats:p>\n          </jats:sec>",
+    "DOI": "10.1186/1471-2164-7-187",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Finding function: evaluation methods for functional genomic data",
+    "volume": "7",
+    "author": [
+      {
+        "given": "Chad L",
+        "family": "Myers"
+      },
+      {
+        "given": "Daniel R",
+        "family": "Barrett"
+      },
+      {
+        "given": "Matthew A",
+        "family": "Hibbs"
+      },
+      {
+        "given": "Curtis",
+        "family": "Huttenhower"
+      },
+      {
+        "given": "Olga G",
+        "family": "Troyanskaya"
+      }
+    ],
+    "container-title": "BMC Genomics",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2006,
+          7,
+          25
+        ]
+      ]
+    },
+    "URL": "https://doi.org/fg6wnk",
+    "container-title-short": "BMC Genomics",
+    "PMCID": "PMC1560386",
+    "PMID": "16869964",
+    "id": "1FVd2WW6G",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/1471-2164-7-187"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1186/s12916-014-0206-2",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "The overlap between vascular disease and Alzheimer’s disease - lessons from pathology",
+    "volume": "12",
+    "author": [
+      {
+        "given": "Johannes",
+        "family": "Attems"
+      },
+      {
+        "given": "Kurt A",
+        "family": "Jellinger"
+      }
+    ],
+    "container-title": "BMC Medicine",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          11,
+          11
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f6pjd4",
+    "container-title-short": "BMC Med",
+    "PMCID": "PMC4226890",
+    "PMID": "25385447",
+    "id": "D83Aqhga",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s12916-014-0206-2"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:sec>\n                <jats:title>Background</jats:title>\n                <jats:p>Mapping disease-associated genetic variants to complex disease pathophysiology is a major challenge in translating findings from genome-wide association studies into novel therapeutic opportunities. The difficulty lies in our limited understanding of how phenotypic traits arise from non-coding genetic variants in highly organized biological systems with heterogeneous gene expression across cells and tissues.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Results</jats:title>\n                <jats:p>We present a novel strategy, called GWAS component analysis, for transferring disease associations from single-nucleotide polymorphisms to co-expression modules by stacking models trained using reference genome and tissue-specific gene expression data. Application of this method to genome-wide association studies of blood cell counts confirmed that it could detect gene sets enriched in expected cell types. In addition, coupling of our method with Bayesian networks enables GWAS components to be used to discover drug targets.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Conclusions</jats:title>\n                <jats:p>We tested genome-wide associations of four disease phenotypes, including age-related macular degeneration, Crohn’s disease, ulcerative colitis and rheumatoid arthritis, and demonstrated the proposed method could select more functional genes than S-PrediXcan, the previous single-step model for predicting gene-level associations from SNP-level associations.</jats:p>\n              </jats:sec>",
+    "DOI": "10.1186/s13040-020-00216-9",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Identification of therapeutic targets from genetic association studies using hierarchical component analysis",
+    "volume": "13",
+    "author": [
+      {
+        "given": "Hao-Chih",
+        "family": "Lee"
+      },
+      {
+        "given": "Osamu",
+        "family": "Ichikawa"
+      },
+      {
+        "given": "Benjamin S.",
+        "family": "Glicksberg"
+      },
+      {
+        "given": "Aparna A.",
+        "family": "Divaraniya"
+      },
+      {
+        "given": "Christine E.",
+        "family": "Becker"
+      },
+      {
+        "given": "Pankaj",
+        "family": "Agarwal"
+      },
+      {
+        "given": "Joel T.",
+        "family": "Dudley"
+      }
+    ],
+    "container-title": "BioData Mining",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          6,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjp5pf",
+    "container-title-short": "BioData Mining",
+    "PMCID": "PMC7301559",
+    "PMID": "32565911",
+    "id": "57TjOMEA",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13040-020-00216-9"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "DOI": "10.1186/s13059-016-1070-5",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Estimating the population abundance of tissue-infiltrating immune and stromal cell populations using gene expression",
+    "volume": "17",
+    "author": [
+      {
+        "given": "Etienne",
+        "family": "Becht"
+      },
+      {
+        "given": "Nicolas A.",
+        "family": "Giraldo"
+      },
+      {
+        "given": "Laetitia",
+        "family": "Lacroix"
+      },
+      {
+        "given": "Bénédicte",
+        "family": "Buttard"
+      },
+      {
+        "given": "Nabila",
+        "family": "Elarouci"
+      },
+      {
+        "given": "Florent",
+        "family": "Petitprez"
+      },
+      {
+        "given": "Janick",
+        "family": "Selves"
+      },
+      {
+        "given": "Pierre",
+        "family": "Laurent-Puig"
+      },
+      {
+        "given": "Catherine",
+        "family": "Sautès-Fridman"
+      },
+      {
+        "given": "Wolf H.",
+        "family": "Fridman"
+      },
+      {
+        "given": "Aurélien",
+        "family": "de Reyniès"
+      }
+    ],
+    "container-title": "Genome Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          10,
+          20
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f87sgf",
+    "container-title-short": "Genome Biol",
+    "PMCID": "PMC5073889",
+    "PMID": "27765066",
+    "id": "18TSqd1tG",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13059-016-1070-5"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:sec>\n                <jats:title>Background</jats:title>\n                <jats:p>The Critical Assessment of Functional Annotation (CAFA) is an ongoing, global, community-driven effort to evaluate and improve the computational annotation of protein function.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Results</jats:title>\n                <jats:p>Here, we report on the results of the third CAFA challenge, CAFA3, that featured an expanded analysis over the previous CAFA rounds, both in terms of volume of data analyzed and the types of analysis performed. In a novel and major new development, computational predictions and assessment goals drove some of the experimental assays, resulting in new functional annotations for more than 1000 genes. Specifically, we performed experimental whole-genome mutation screening in <jats:italic>Candida albicans</jats:italic> and <jats:italic>Pseudomonas aureginosa</jats:italic> genomes, which provided us with genome-wide experimental data for genes associated with biofilm formation and motility. We further performed targeted assays on selected genes in <jats:italic>Drosophila melanogaster</jats:italic>, which we suspected of being involved in long-term memory.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Conclusion</jats:title>\n                <jats:p>We conclude that while predictions of the molecular function and biological process annotations have slightly improved over time, those of the cellular component have not. Term-centric prediction of experimental annotations remains equally challenging; although the performance of the top methods is significantly better than the expectations set by baseline methods in <jats:italic>C. albicans</jats:italic> and <jats:italic>D. melanogaster</jats:italic>, it leaves considerable room and need for improvement. 
Finally, we report that the CAFA community now involves a broad range of participants with expertise in bioinformatics, biological experimentation, biocuration, and bio-ontologies, working together to improve functional annotation, computational function prediction, and our ability to manage big data in the era of large experimental screens.</jats:p>\n              </jats:sec>",
+    "DOI": "10.1186/s13059-019-1835-8",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "The CAFA challenge reports improved protein function prediction and new functional annotations for hundreds of genes through experimental screens",
+    "volume": "20",
+    "author": [
+      {
+        "given": "Naihui",
+        "family": "Zhou"
+      },
+      {
+        "given": "Yuxiang",
+        "family": "Jiang"
+      },
+      {
+        "given": "Timothy R.",
+        "family": "Bergquist"
+      },
+      {
+        "given": "Alexandra J.",
+        "family": "Lee"
+      },
+      {
+        "given": "Balint Z.",
+        "family": "Kacsoh"
+      },
+      {
+        "given": "Alex W.",
+        "family": "Crocker"
+      },
+      {
+        "given": "Kimberley A.",
+        "family": "Lewis"
+      },
+      {
+        "given": "George",
+        "family": "Georghiou"
+      },
+      {
+        "given": "Huy N.",
+        "family": "Nguyen"
+      },
+      {
+        "given": "Md Nafiz",
+        "family": "Hamid"
+      },
+      {
+        "given": "Larry",
+        "family": "Davis"
+      },
+      {
+        "given": "Tunca",
+        "family": "Dogan"
+      },
+      {
+        "given": "Volkan",
+        "family": "Atalay"
+      },
+      {
+        "given": "Ahmet S.",
+        "family": "Rifaioglu"
+      },
+      {
+        "given": "Alperen",
+        "family": "Dalkıran"
+      },
+      {
+        "given": "Rengul",
+        "family": "Cetin Atalay"
+      },
+      {
+        "given": "Chengxin",
+        "family": "Zhang"
+      },
+      {
+        "given": "Rebecca L.",
+        "family": "Hurto"
+      },
+      {
+        "given": "Peter L.",
+        "family": "Freddolino"
+      },
+      {
+        "given": "Yang",
+        "family": "Zhang"
+      },
+      {
+        "given": "Prajwal",
+        "family": "Bhat"
+      },
+      {
+        "given": "Fran",
+        "family": "Supek"
+      },
+      {
+        "given": "José M.",
+        "family": "Fernández"
+      },
+      {
+        "given": "Branislava",
+        "family": "Gemovic"
+      },
+      {
+        "given": "Vladimir R.",
+        "family": "Perovic"
+      },
+      {
+        "given": "Radoslav S.",
+        "family": "Davidović"
+      },
+      {
+        "given": "Neven",
+        "family": "Sumonja"
+      },
+      {
+        "given": "Nevena",
+        "family": "Veljkovic"
+      },
+      {
+        "given": "Ehsaneddin",
+        "family": "Asgari"
+      },
+      {
+        "given": "Mohammad R.K.",
+        "family": "Mofrad"
+      },
+      {
+        "given": "Giuseppe",
+        "family": "Profiti"
+      },
+      {
+        "given": "Castrense",
+        "family": "Savojardo"
+      },
+      {
+        "given": "Pier Luigi",
+        "family": "Martelli"
+      },
+      {
+        "given": "Rita",
+        "family": "Casadio"
+      },
+      {
+        "given": "Florian",
+        "family": "Boecker"
+      },
+      {
+        "given": "Heiko",
+        "family": "Schoof"
+      },
+      {
+        "given": "Indika",
+        "family": "Kahanda"
+      },
+      {
+        "given": "Natalie",
+        "family": "Thurlby"
+      },
+      {
+        "given": "Alice C.",
+        "family": "McHardy"
+      },
+      {
+        "given": "Alexandre",
+        "family": "Renaux"
+      },
+      {
+        "given": "Rabie",
+        "family": "Saidi"
+      },
+      {
+        "given": "Julian",
+        "family": "Gough"
+      },
+      {
+        "given": "Alex A.",
+        "family": "Freitas"
+      },
+      {
+        "given": "Magdalena",
+        "family": "Antczak"
+      },
+      {
+        "given": "Fabio",
+        "family": "Fabris"
+      },
+      {
+        "given": "Mark N.",
+        "family": "Wass"
+      },
+      {
+        "given": "Jie",
+        "family": "Hou"
+      },
+      {
+        "given": "Jianlin",
+        "family": "Cheng"
+      },
+      {
+        "given": "Zheng",
+        "family": "Wang"
+      },
+      {
+        "given": "Alfonso E.",
+        "family": "Romero"
+      },
+      {
+        "given": "Alberto",
+        "family": "Paccanaro"
+      },
+      {
+        "given": "Haixuan",
+        "family": "Yang"
+      },
+      {
+        "given": "Tatyana",
+        "family": "Goldberg"
+      },
+      {
+        "given": "Chenguang",
+        "family": "Zhao"
+      },
+      {
+        "given": "Liisa",
+        "family": "Holm"
+      },
+      {
+        "given": "Petri",
+        "family": "Törönen"
+      },
+      {
+        "given": "Alan J.",
+        "family": "Medlar"
+      },
+      {
+        "given": "Elaine",
+        "family": "Zosa"
+      },
+      {
+        "given": "Itamar",
+        "family": "Borukhov"
+      },
+      {
+        "given": "Ilya",
+        "family": "Novikov"
+      },
+      {
+        "given": "Angela",
+        "family": "Wilkins"
+      },
+      {
+        "given": "Olivier",
+        "family": "Lichtarge"
+      },
+      {
+        "given": "Po-Han",
+        "family": "Chi"
+      },
+      {
+        "given": "Wei-Cheng",
+        "family": "Tseng"
+      },
+      {
+        "given": "Michal",
+        "family": "Linial"
+      },
+      {
+        "given": "Peter W.",
+        "family": "Rose"
+      },
+      {
+        "given": "Christophe",
+        "family": "Dessimoz"
+      },
+      {
+        "given": "Vedrana",
+        "family": "Vidulin"
+      },
+      {
+        "given": "Saso",
+        "family": "Dzeroski"
+      },
+      {
+        "given": "Ian",
+        "family": "Sillitoe"
+      },
+      {
+        "given": "Sayoni",
+        "family": "Das"
+      },
+      {
+        "given": "Jonathan Gill",
+        "family": "Lees"
+      },
+      {
+        "given": "David T.",
+        "family": "Jones"
+      },
+      {
+        "given": "Cen",
+        "family": "Wan"
+      },
+      {
+        "given": "Domenico",
+        "family": "Cozzetto"
+      },
+      {
+        "given": "Rui",
+        "family": "Fa"
+      },
+      {
+        "given": "Mateo",
+        "family": "Torres"
+      },
+      {
+        "given": "Alex",
+        "family": "Warwick Vesztrocy"
+      },
+      {
+        "given": "Jose Manuel",
+        "family": "Rodriguez"
+      },
+      {
+        "given": "Michael L.",
+        "family": "Tress"
+      },
+      {
+        "given": "Marco",
+        "family": "Frasca"
+      },
+      {
+        "given": "Marco",
+        "family": "Notaro"
+      },
+      {
+        "given": "Giuliano",
+        "family": "Grossi"
+      },
+      {
+        "given": "Alessandro",
+        "family": "Petrini"
+      },
+      {
+        "given": "Matteo",
+        "family": "Re"
+      },
+      {
+        "given": "Giorgio",
+        "family": "Valentini"
+      },
+      {
+        "given": "Marco",
+        "family": "Mesiti"
+      },
+      {
+        "given": "Daniel B.",
+        "family": "Roche"
+      },
+      {
+        "given": "Jonas",
+        "family": "Reeb"
+      },
+      {
+        "given": "David W.",
+        "family": "Ritchie"
+      },
+      {
+        "given": "Sabeur",
+        "family": "Aridhi"
+      },
+      {
+        "given": "Seyed Ziaeddin",
+        "family": "Alborzi"
+      },
+      {
+        "given": "Marie-Dominique",
+        "family": "Devignes"
+      },
+      {
+        "given": "Da Chen Emily",
+        "family": "Koo"
+      },
+      {
+        "given": "Richard",
+        "family": "Bonneau"
+      },
+      {
+        "given": "Vladimir",
+        "family": "Gligorijević"
+      },
+      {
+        "given": "Meet",
+        "family": "Barot"
+      },
+      {
+        "given": "Hai",
+        "family": "Fang"
+      },
+      {
+        "given": "Stefano",
+        "family": "Toppo"
+      },
+      {
+        "given": "Enrico",
+        "family": "Lavezzo"
+      },
+      {
+        "given": "Marco",
+        "family": "Falda"
+      },
+      {
+        "given": "Michele",
+        "family": "Berselli"
+      },
+      {
+        "given": "Silvio C.E.",
+        "family": "Tosatto"
+      },
+      {
+        "given": "Marco",
+        "family": "Carraro"
+      },
+      {
+        "given": "Damiano",
+        "family": "Piovesan"
+      },
+      {
+        "given": "Hafeez",
+        "family": "Ur Rehman"
+      },
+      {
+        "given": "Qizhong",
+        "family": "Mao"
+      },
+      {
+        "given": "Shanshan",
+        "family": "Zhang"
+      },
+      {
+        "given": "Slobodan",
+        "family": "Vucetic"
+      },
+      {
+        "given": "Gage S.",
+        "family": "Black"
+      },
+      {
+        "given": "Dane",
+        "family": "Jo"
+      },
+      {
+        "given": "Erica",
+        "family": "Suh"
+      },
+      {
+        "given": "Jonathan B.",
+        "family": "Dayton"
+      },
+      {
+        "given": "Dallas J.",
+        "family": "Larsen"
+      },
+      {
+        "given": "Ashton R.",
+        "family": "Omdahl"
+      },
+      {
+        "given": "Liam J.",
+        "family": "McGuffin"
+      },
+      {
+        "given": "Danielle A.",
+        "family": "Brackenridge"
+      },
+      {
+        "given": "Patricia C.",
+        "family": "Babbitt"
+      },
+      {
+        "given": "Jeffrey M.",
+        "family": "Yunes"
+      },
+      {
+        "given": "Paolo",
+        "family": "Fontana"
+      },
+      {
+        "given": "Feng",
+        "family": "Zhang"
+      },
+      {
+        "given": "Shanfeng",
+        "family": "Zhu"
+      },
+      {
+        "given": "Ronghui",
+        "family": "You"
+      },
+      {
+        "given": "Zihan",
+        "family": "Zhang"
+      },
+      {
+        "given": "Suyang",
+        "family": "Dai"
+      },
+      {
+        "given": "Shuwei",
+        "family": "Yao"
+      },
+      {
+        "given": "Weidong",
+        "family": "Tian"
+      },
+      {
+        "given": "Renzhi",
+        "family": "Cao"
+      },
+      {
+        "given": "Caleb",
+        "family": "Chandler"
+      },
+      {
+        "given": "Miguel",
+        "family": "Amezola"
+      },
+      {
+        "given": "Devon",
+        "family": "Johnson"
+      },
+      {
+        "given": "Jia-Ming",
+        "family": "Chang"
+      },
+      {
+        "given": "Wen-Hung",
+        "family": "Liao"
+      },
+      {
+        "given": "Yi-Wei",
+        "family": "Liu"
+      },
+      {
+        "given": "Stefano",
+        "family": "Pascarelli"
+      },
+      {
+        "given": "Yotam",
+        "family": "Frank"
+      },
+      {
+        "given": "Robert",
+        "family": "Hoehndorf"
+      },
+      {
+        "given": "Maxat",
+        "family": "Kulmanov"
+      },
+      {
+        "given": "Imane",
+        "family": "Boudellioua"
+      },
+      {
+        "given": "Gianfranco",
+        "family": "Politano"
+      },
+      {
+        "given": "Stefano",
+        "family": "Di Carlo"
+      },
+      {
+        "given": "Alfredo",
+        "family": "Benso"
+      },
+      {
+        "given": "Kai",
+        "family": "Hakala"
+      },
+      {
+        "given": "Filip",
+        "family": "Ginter"
+      },
+      {
+        "given": "Farrokh",
+        "family": "Mehryary"
+      },
+      {
+        "given": "Suwisa",
+        "family": "Kaewphan"
+      },
+      {
+        "given": "Jari",
+        "family": "Björne"
+      },
+      {
+        "given": "Hans",
+        "family": "Moen"
+      },
+      {
+        "given": "Martti E.E.",
+        "family": "Tolvanen"
+      },
+      {
+        "given": "Tapio",
+        "family": "Salakoski"
+      },
+      {
+        "given": "Daisuke",
+        "family": "Kihara"
+      },
+      {
+        "given": "Aashish",
+        "family": "Jain"
+      },
+      {
+        "given": "Tomislav",
+        "family": "Šmuc"
+      },
+      {
+        "given": "Adrian",
+        "family": "Altenhoff"
+      },
+      {
+        "given": "Asa",
+        "family": "Ben-Hur"
+      },
+      {
+        "given": "Burkhard",
+        "family": "Rost"
+      },
+      {
+        "given": "Steven E.",
+        "family": "Brenner"
+      },
+      {
+        "given": "Christine A.",
+        "family": "Orengo"
+      },
+      {
+        "given": "Constance J.",
+        "family": "Jeffery"
+      },
+      {
+        "given": "Giovanni",
+        "family": "Bosco"
+      },
+      {
+        "given": "Deborah A.",
+        "family": "Hogan"
+      },
+      {
+        "given": "Maria J.",
+        "family": "Martin"
+      },
+      {
+        "given": "Claire",
+        "family": "O’Donovan"
+      },
+      {
+        "given": "Sean D.",
+        "family": "Mooney"
+      },
+      {
+        "given": "Casey S.",
+        "family": "Greene"
+      },
+      {
+        "given": "Predrag",
+        "family": "Radivojac"
+      },
+      {
+        "given": "Iddo",
+        "family": "Friedberg"
+      }
+    ],
+    "container-title": "Genome Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          11,
+          19
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ggnxpz",
+    "container-title-short": "Genome Biol",
+    "PMCID": "PMC6864930",
+    "PMID": "31744546",
+    "id": "DN7TyZzb",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13059-019-1835-8"
+  },
+  {
+    "publisher": "Springer Science and Business Media LLC",
+    "issue": "1",
+    "abstract": "<jats:title>Abstract</jats:title><jats:sec>\n                <jats:title>Background</jats:title>\n                <jats:p>Polygenic risk scores (PRS) are valuable to translate the results of genome-wide association studies (GWAS) into clinical practice. To date, most GWAS have been based on individuals of European-ancestry leading to poor performance in populations of non-European ancestry.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Results</jats:title>\n                <jats:p>We introduce the polygenic transcriptome risk score (PTRS), which is based on predicted transcript levels (rather than SNPs), and explore the portability of PTRS across populations using UK Biobank data.</jats:p>\n              </jats:sec><jats:sec>\n                <jats:title>Conclusions</jats:title>\n                <jats:p>We show that PTRS has a significantly higher portability (Wilcoxon <jats:italic>p</jats:italic>=0.013) in the African-descent samples where the loss of performance is most acute with better performance than PRS when used in combination.</jats:p>\n              </jats:sec>",
+    "DOI": "10.1186/s13059-021-02591-w",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Polygenic transcriptome risk scores (PTRS) can improve portability of polygenic risk scores across ancestries",
+    "volume": "23",
+    "author": [
+      {
+        "given": "Yanyu",
+        "family": "Liang"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Ani",
+        "family": "Manichaikul"
+      },
+      {
+        "given": "Abraham A.",
+        "family": "Palmer"
+      },
+      {
+        "given": "Nancy J.",
+        "family": "Cox"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "container-title": "Genome Biology",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2022,
+          1,
+          13
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gqtdvn",
+    "container-title-short": "Genome Biol",
+    "PMCID": "PMC8759285",
+    "PMID": "35027082",
+    "id": "hSYqogYZ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1186/s13059-021-02591-w"
+  },
+  {
+    "publisher": "Royal College of Psychiatrists",
+    "issue": "3",
+    "abstract": "<jats:sec><jats:title>Background</jats:title><jats:p>Bipolar disorder and schizophrenia are associated with increased mortality relative to the general population. There is an international emphasis on decreasing this excess mortality.</jats:p></jats:sec><jats:sec><jats:title>Aims</jats:title><jats:p>To determine whether the mortality gap between individuals with bipolar disorder and schizophrenia and the general population has decreased.</jats:p></jats:sec><jats:sec><jats:title>Method</jats:title><jats:p>A nationally representative cohort study using primary care electronic health records from 2000 to 2014, comparing all patients diagnosed with bipolar disorder or schizophrenia and the general population. The primary outcome was all-cause mortality.</jats:p></jats:sec><jats:sec><jats:title>Results</jats:title><jats:p>Individuals with bipolar disorder and schizophrenia had elevated mortality (adjusted hazard ratio (HR) = 1.79, 95% CI 1.67–1.88 and 2.08, 95% CI 1.98–2.19 respectively). Adjusted HRs for bipolar disorder increased by 0.14/year (95% CI 0.10–0.19) from 2006 to 2014. The adjusted HRs for schizophrenia increased gradually from 2004 to 2010 (0.11/year, 95% CI 0.04–0.17) and rapidly after 2010 (0.34/year, 95% CI 0.18–0.49).</jats:p></jats:sec><jats:sec><jats:title>Conclusions</jats:title><jats:p>The mortality gap between individuals with bipolar disorder and schizophrenia, and the general population is widening.</jats:p></jats:sec>",
+    "DOI": "10.1192/bjp.bp.117.202606",
+    "type": "article-journal",
+    "page": "175-181",
+    "source": "Crossref",
+    "title": "Mortality gap for people with bipolar disorder and schizophrenia: UK-based cohort study 2000–2014",
+    "volume": "211",
+    "author": [
+      {
+        "given": "Joseph F.",
+        "family": "Hayes"
+      },
+      {
+        "given": "Louise",
+        "family": "Marston"
+      },
+      {
+        "given": "Kate",
+        "family": "Walters"
+      },
+      {
+        "given": "Michael B.",
+        "family": "King"
+      },
+      {
+        "given": "David P. J.",
+        "family": "Osborn"
+      }
+    ],
+    "container-title": "British Journal of Psychiatry",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          9
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gbwcjx",
+    "container-title-short": "Br J Psychiatry",
+    "PMCID": "PMC5579328",
+    "PMID": "28684403",
+    "id": "17LYMnG9n",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1192/bjp.bp.117.202606"
+  },
+  {
+    "publisher": "Elsevier BV",
+    "issue": "4",
+    "DOI": "10.1194/jlr.s092007",
+    "type": "article-journal",
+    "page": "741-746",
+    "source": "Crossref",
+    "title": "Niacin: an old lipid drug in a new NAD+ dress",
+    "volume": "60",
+    "author": [
+      {
+        "given": "Mario",
+        "family": "Romani"
+      },
+      {
+        "given": "Dina Carina",
+        "family": "Hofer"
+      },
+      {
+        "given": "Elena",
+        "family": "Katsyuba"
+      },
+      {
+        "given": "Johan",
+        "family": "Auwerx"
+      }
+    ],
+    "container-title": "Journal of Lipid Research",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          4
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjpjft",
+    "container-title-short": "Journal of Lipid Research",
+    "PMCID": "PMC6446705",
+    "PMID": "30782960",
+    "id": "7OgaYjeL",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1194/jlr.s092007"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "4",
+    "DOI": "10.1371/journal.pcbi.1004219",
+    "type": "article-journal",
+    "page": "e1004219",
+    "source": "Crossref",
+    "title": "MAGMA: Generalized Gene-Set Analysis of GWAS Data",
+    "volume": "11",
+    "author": [
+      {
+        "given": "Christiaan A.",
+        "family": "de Leeuw"
+      },
+      {
+        "given": "Joris M.",
+        "family": "Mooij"
+      },
+      {
+        "given": "Tom",
+        "family": "Heskes"
+      },
+      {
+        "given": "Danielle",
+        "family": "Posthuma"
+      }
+    ],
+    "container-title": "PLOS Computational Biology",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Hua",
+        "family": "Tang"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          4,
+          17
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gf92gp",
+    "container-title-short": "PLoS Comput Biol",
+    "PMCID": "PMC4401657",
+    "PMID": "25885710",
+    "id": "19XiXgYmd",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pcbi.1004219"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "1",
+    "DOI": "10.1371/journal.pgen.1007889",
+    "type": "article-journal",
+    "page": "e1007889",
+    "source": "Crossref",
+    "title": "Integrating predicted transcriptome from multiple tissues improves association detection",
+    "volume": "15",
+    "author": [
+      {
+        "given": "Alvaro N.",
+        "family": "Barbeira"
+      },
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Jiamao",
+        "family": "Zheng"
+      },
+      {
+        "given": "Heather E.",
+        "family": "Wheeler"
+      },
+      {
+        "given": "Dan L.",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "container-title": "PLOS Genetics",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Vincent",
+        "family": "Plagnol"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          1,
+          22
+        ]
+      ]
+    },
+    "URL": "https://doi.org/ghs8vx",
+    "container-title-short": "PLoS Genet",
+    "PMCID": "PMC6358100",
+    "PMID": "30668570",
+    "id": "1FFzCXo1s",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pgen.1007889"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "12",
+    "DOI": "10.1371/journal.pgen.1008489",
+    "type": "article-journal",
+    "page": "e1008489",
+    "source": "Crossref",
+    "title": "Are drug targets with genetic support twice as likely to be approved? Revised estimates of the impact of genetic support for drug mechanisms on the probability of drug approval",
+    "volume": "15",
+    "author": [
+      {
+        "given": "Emily A.",
+        "family": "King"
+      },
+      {
+        "given": "J. Wade",
+        "family": "Davis"
+      },
+      {
+        "given": "Jacob F.",
+        "family": "Degner"
+      }
+    ],
+    "container-title": "PLOS Genetics",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Jonathan",
+        "family": "Marchini"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          12,
+          12
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gg957r",
+    "container-title-short": "PLoS Genet",
+    "PMCID": "PMC6907751",
+    "PMID": "31830040",
+    "id": "PgEwSS4Q",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pgen.1008489"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "4",
+    "abstract": "<jats:p>Transcriptome-wide association studies (TWAS) have been widely used to integrate transcriptomic and genetic data to study complex human diseases. Within a test dataset lacking transcriptomic data, traditional two-stage TWAS methods first impute gene expression by creating a weighted sum that aggregates SNPs with their corresponding cis-eQTL effects on reference transcriptome. Traditional TWAS methods then employ a linear regression model to assess the association between imputed gene expression and test phenotype, thereby assuming the effect of a cis-eQTL SNP on test phenotype is a linear function of the eQTL’s estimated effect on reference transcriptome. To increase TWAS robustness to this assumption, we propose a novel Variance-Component TWAS procedure (VC-TWAS) that assumes the effects of cis-eQTL SNPs on phenotype are random (with variance proportional to corresponding reference cis-eQTL effects) rather than fixed. VC-TWAS is applicable to both continuous and dichotomous phenotypes, as well as individual-level and summary-level GWAS data. Using simulated data, we show VC-TWAS is more powerful than traditional TWAS methods based on a two-stage Burden test, especially when eQTL genetic effects on test phenotype are no longer a linear function of their eQTL genetic effects on reference transcriptome. We further applied VC-TWAS to both individual-level (N = ~3.4K) and summary-level (N = ~54K) GWAS data to study Alzheimer’s dementia (AD). With the individual-level data, we detected 13 significant risk genes including 6 known GWAS risk genes such as <jats:italic>TOMM40</jats:italic> that were missed by traditional TWAS methods. With the summary-level data, we detected 57 significant risk genes considering only cis-SNPs and 71 significant genes considering both cis- and trans- SNPs, which also validated our findings with the individual-level GWAS data. Our VC-TWAS method is implemented in the TIGAR tool for public use.</jats:p>",
+    "DOI": "10.1371/journal.pgen.1009482",
+    "type": "article-journal",
+    "page": "e1009482",
+    "source": "Crossref",
+    "title": "Novel Variance-Component TWAS method for studying complex human diseases with applications to Alzheimer’s dementia",
+    "volume": "17",
+    "author": [
+      {
+        "given": "Shizhen",
+        "family": "Tang"
+      },
+      {
+        "given": "Aron S.",
+        "family": "Buchman"
+      },
+      {
+        "given": "Philip L.",
+        "family": "De Jager"
+      },
+      {
+        "given": "David A.",
+        "family": "Bennett"
+      },
+      {
+        "given": "Michael P.",
+        "family": "Epstein"
+      },
+      {
+        "given": "Jingjing",
+        "family": "Yang"
+      }
+    ],
+    "container-title": "PLOS Genetics",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Lin",
+        "family": "Chen"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          4,
+          2
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gjpr3j",
+    "container-title-short": "PLoS Genet",
+    "PMCID": "PMC8046351",
+    "PMID": "33798195",
+    "id": "yEdRP9Xx",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pgen.1009482"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "10",
+    "DOI": "10.1371/journal.pone.0109760",
+    "type": "article-journal",
+    "page": "e109760",
+    "source": "Crossref",
+    "title": "Copy Number Loss of the Interferon Gene Cluster in Melanomas Is Linked to Reduced T Cell Infiltrate and Poor Patient Prognosis",
+    "volume": "9",
+    "author": [
+      {
+        "given": "Peter S.",
+        "family": "Linsley"
+      },
+      {
+        "given": "Cate",
+        "family": "Speake"
+      },
+      {
+        "given": "Elizabeth",
+        "family": "Whalen"
+      },
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      }
+    ],
+    "container-title": "PLoS ONE",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Maria G.",
+        "family": "Castro"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          10,
+          14
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gk9k8s",
+    "container-title-short": "PLoS ONE",
+    "PMCID": "PMC4196925",
+    "PMID": "25314013",
+    "id": "D5XBhzim",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pone.0109760"
+  },
+  {
+    "publisher": "Public Library of Science (PLoS)",
+    "issue": "1",
+    "DOI": "10.1371/journal.pone.0192082",
+    "type": "article-journal",
+    "page": "e0192082",
+    "source": "Crossref",
+    "title": "Shared and organism-specific host responses to childhood diarrheal diseases revealed by whole blood transcript profiling",
+    "volume": "13",
+    "author": [
+      {
+        "given": "Hannah A.",
+        "family": "DeBerg"
+      },
+      {
+        "given": "Mussaret B.",
+        "family": "Zaidi"
+      },
+      {
+        "given": "Matthew C.",
+        "family": "Altman"
+      },
+      {
+        "given": "Prasong",
+        "family": "Khaenam"
+      },
+      {
+        "given": "Vivian H.",
+        "family": "Gersuk"
+      },
+      {
+        "given": "Freddy D.",
+        "family": "Campos"
+      },
+      {
+        "given": "Iza",
+        "family": "Perez-Martinez"
+      },
+      {
+        "given": "Mario",
+        "family": "Meza-Segura"
+      },
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      },
+      {
+        "given": "Jacques",
+        "family": "Banchereau"
+      },
+      {
+        "given": "Teresa",
+        "family": "Estrada-Garcia"
+      },
+      {
+        "given": "Peter S.",
+        "family": "Linsley"
+      }
+    ],
+    "container-title": "PLOS ONE",
+    "language": "en",
+    "editor": [
+      {
+        "given": "Karol",
+        "family": "Sestak"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2018,
+          1,
+          29
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gcwgcr",
+    "container-title-short": "PLoS ONE",
+    "PMCID": "PMC5788382",
+    "PMID": "29377961",
+    "id": "RliFvowC",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.1371/journal.pone.0192082"
+  },
+  {
+    "publisher": "Frontiers Media SA",
+    "DOI": "10.3389/fphys.2020.00393",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Lipid and Lipoprotein Metabolism in Microglia",
+    "volume": "11",
+    "author": [
+      {
+        "given": "Bailey A.",
+        "family": "Loving"
+      },
+      {
+        "given": "Kimberley D.",
+        "family": "Bruce"
+      }
+    ],
+    "container-title": "Frontiers in Physiology",
+    "issued": {
+      "date-parts": [
+        [
+          2020,
+          4,
+          28
+        ]
+      ]
+    },
+    "URL": "https://doi.org/gk92xd",
+    "container-title-short": "Front. Physiol.",
+    "PMCID": "PMC7198855",
+    "PMID": "32411016",
+    "id": "18I4ish9s",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.3389/fphys.2020.00393"
+  },
+  {
+    "publisher": "MDPI AG",
+    "issue": "7",
+    "DOI": "10.3390/v5071664",
+    "type": "article-journal",
+    "page": "1664-1681",
+    "source": "Crossref",
+    "title": "Identification of Genes Critical for Resistance to Infection by West Nile Virus Using RNA-Seq Analysis",
+    "volume": "5",
+    "author": [
+      {
+        "given": "Feng",
+        "family": "Qian"
+      },
+      {
+        "given": "Lisa",
+        "family": "Chung"
+      },
+      {
+        "given": "Wei",
+        "family": "Zheng"
+      },
+      {
+        "given": "Vincent",
+        "family": "Bruno"
+      },
+      {
+        "given": "Roger",
+        "family": "Alexander"
+      },
+      {
+        "given": "Zhong",
+        "family": "Wang"
+      },
+      {
+        "given": "Xiaomei",
+        "family": "Wang"
+      },
+      {
+        "given": "Sebastian",
+        "family": "Kurscheid"
+      },
+      {
+        "given": "Hongyu",
+        "family": "Zhao"
+      },
+      {
+        "given": "Erol",
+        "family": "Fikrig"
+      },
+      {
+        "given": "Mark",
+        "family": "Gerstein"
+      },
+      {
+        "given": "Michael",
+        "family": "Snyder"
+      },
+      {
+        "given": "Ruth",
+        "family": "Montgomery"
+      }
+    ],
+    "container-title": "Viruses",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          7,
+          8
+        ]
+      ]
+    },
+    "URL": "https://doi.org/f49d7g",
+    "container-title-short": "Viruses",
+    "PMCID": "PMC3738954",
+    "PMID": "23881275",
+    "id": "mtMYROCN",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.3390/v5071664"
+  },
+  {
+    "type": "article",
+    "id": "1DJZvtwP1",
+    "categories": [
+      "LINCS",
+      "L1000",
+      "consensus",
+      "database",
+      "perturbation",
+      "Rephetio"
+    ],
+    "author": [
+      {
+        "family": "Himmelstein",
+        "given": "Daniel"
+      },
+      {
+        "family": "Brueggeman",
+        "given": "Leo"
+      },
+      {
+        "family": "Baranzini",
+        "given": "Sergio"
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          3,
+          8
+        ]
+      ]
+    },
+    "abstract": "This repository creates user-friendly datasets for LINCS L1000. We extend the L1000 data offerings with consensus signatures, compound mappings, and chemical similarities. Read about this release on Thinklab.",
+    "DOI": "10.5281/zenodo.47223",
+    "publisher": "Zenodo",
+    "title": "Dhimmel/Lincs V2.0: Refined Consensus Signatures From Lincs L1000",
+    "URL": "https://doi.org/f3mqvr",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.5281/zenodo.47223"
+  },
+  {
+    "type": "article",
+    "id": "10KA5jTBQ",
+    "categories": [
+      "indications",
+      "PharmacotherapyDB",
+      "disease modifying",
+      "drugs",
+      "disease",
+      "Rephetio"
+    ],
+    "author": [
+      {
+        "family": "Himmelstein",
+        "given": "Daniel S."
+      },
+      {
+        "literal": "Pouya Khankhanian"
+      },
+      {
+        "family": "Hessler",
+        "given": "Christine S."
+      },
+      {
+        "family": "Green",
+        "given": "Ari J."
+      },
+      {
+        "family": "Baranzini",
+        "given": "Sergio E."
+      }
+    ],
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          3,
+          15
+        ]
+      ]
+    },
+    "abstract": "This is the repository for the initial release of our catalog of drug therapies for disease. The catalog, named PharmacotherapyDB, contains physician curated medical indications. The data for this release is also on figshare\n\nThis initial release contains 97 diseases and 601 drugs. Between these drug–disease pairs, there are 755 disease-modifying therapies, 390 symptomatic therapies, and 243 non-indications. To enable integrative analyses, drugs and diseases are coded using DrugBank and Disease Ontology identifiers.\n\nThe catalog adheres to pathophysiological principals first. Therefore, the catalog includes indications with a poor risk–benefit ratio that are rarely used in the modern clinic. Contributions are welcome as we hope to expand and refine the catalog over time.\n\nRead more on Thinklab.",
+    "DOI": "10.5281/zenodo.47664",
+    "publisher": "Zenodo",
+    "title": "Dhimmel/Indications V1.0. Pharmacotherapydb: The Open Catalog Of Drug Therapies For Disease",
+    "URL": "https://doi.org/f3mqwb",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.5281/zenodo.47664"
+  },
+  {
+    "publisher": "eLife Sciences Publications, Ltd",
+    "abstract": "<jats:p>The ability to computationally predict whether a compound treats a disease would improve the economy and success rate of drug approval. This study describes Project Rephetio to systematically model drug efficacy based on 755 existing treatments. First, we constructed Hetionet (neo4j.het.io), an integrative network encoding knowledge from millions of biomedical studies. Hetionet v1.0 consists of 47,031 nodes of 11 types and 2,250,197 relationships of 24 types. Data were integrated from 29 public resources to connect compounds, diseases, genes, anatomies, pathways, biological processes, molecular functions, cellular components, pharmacologic classes, side effects, and symptoms. Next, we identified network patterns that distinguish treatments from non-treatments. Then, we predicted the probability of treatment for 209,168 compound–disease pairs (het.io/repurpose). Our predictions validated on two external sets of treatment and provided pharmacological insights on epilepsy, suggesting they will help prioritize drug repurposing candidates. This study was entirely open and received realtime feedback from 40 community members.</jats:p>",
+    "DOI": "10.7554/elife.26726",
+    "type": "article-journal",
+    "source": "Crossref",
+    "title": "Systematic integration of biomedical knowledge prioritizes drugs for repurposing",
+    "volume": "6",
+    "author": [
+      {
+        "given": "Daniel Scott",
+        "family": "Himmelstein"
+      },
+      {
+        "given": "Antoine",
+        "family": "Lizee"
+      },
+      {
+        "given": "Christine",
+        "family": "Hessler"
+      },
+      {
+        "given": "Leo",
+        "family": "Brueggeman"
+      },
+      {
+        "given": "Sabrina L",
+        "family": "Chen"
+      },
+      {
+        "given": "Dexter",
+        "family": "Hadley"
+      },
+      {
+        "given": "Ari",
+        "family": "Green"
+      },
+      {
+        "given": "Pouya",
+        "family": "Khankhanian"
+      },
+      {
+        "given": "Sergio E",
+        "family": "Baranzini"
+      }
+    ],
+    "container-title": "eLife",
+    "language": "en",
+    "issued": {
+      "date-parts": [
+        [
+          2017,
+          9,
+          22
+        ]
+      ]
+    },
+    "URL": "https://doi.org/cdfk",
+    "PMCID": "PMC5640425",
+    "PMID": "28936969",
+    "id": "O21tn8vf",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: doi:10.7554/elife.26726"
+  },
+  {
+    "title": "Depression as a predictor for coronary heart disease. a review and meta-analysis.",
+    "volume": "23",
+    "issue": "1",
+    "page": "51-61",
+    "container-title": "American journal of preventive medicine",
+    "container-title-short": "Am J Prev Med",
+    "ISSN": "0749-3797",
+    "issued": {
+      "date-parts": [
+        [
+          2002,
+          7
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Reiner",
+        "family": "Rugulies"
+      }
+    ],
+    "PMID": "12093424",
+    "DOI": "10.1016/s0749-3797(02)00439-7",
+    "abstract": "To review and quantify the impact of depression on the development of coronary heart disease (CHD) in initially healthy subjects.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/12093424",
+    "type": "article-journal",
+    "id": "10qjLoufR",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:12093424"
+  },
+  {
+    "title": "Elevated rates of protein secretion, evolution, and disease among tissue-specific genes.",
+    "volume": "14",
+    "issue": "1",
+    "page": "54-61",
+    "container-title": "Genome research",
+    "container-title-short": "Genome Res",
+    "ISSN": "1088-9051",
+    "issued": {
+      "date-parts": [
+        [
+          2004,
+          1
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Eitan E",
+        "family": "Winter"
+      },
+      {
+        "given": "Leo",
+        "family": "Goodstadt"
+      },
+      {
+        "given": "Chris P",
+        "family": "Ponting"
+      }
+    ],
+    "PMID": "14707169",
+    "PMCID": "PMC314278",
+    "DOI": "10.1101/gr.1924004",
+    "abstract": "Variation in gene expression has been held responsible for the functional and morphological specialization of tissues. The tissue specificity of genes is known to correlate positively with gene evolution rates. We show here, using large data sets, that when a gene is expressed highly in a small number of tissues, its protein is more likely to be secreted and more likely to be mutated in genetic diseases with Mendelian inheritance. We find that secreted proteins are evolving at faster rates than nonsecreted proteins, and that their evolutionary rates are highly correlated with tissue specificity. However, the impact of secretion on evolutionary rates is countered by tissue-specific constraints that have been held constant over the past 75 million years. We find that disease genes are underrepresented among intracellular and slowly evolving housekeeping genes. These findings illuminate major selective pressures that have shaped the gene repertoires expressed in different mammalian tissues.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/14707169",
+    "type": "article-journal",
+    "id": "18jYvPauB",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:14707169"
+  },
+  {
+    "title": "How does gene expression clustering work?",
+    "volume": "23",
+    "issue": "12",
+    "page": "1499-501",
+    "container-title": "Nature biotechnology",
+    "container-title-short": "Nat Biotechnol",
+    "ISSN": "1087-0156",
+    "issued": {
+      "date-parts": [
+        [
+          2005,
+          12
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Patrik",
+        "family": "D'haeseleer"
+      }
+    ],
+    "PMID": "16333293",
+    "DOI": "10.1038/nbt1205-1499",
+    "abstract": "Clustering is often one of the first steps in gene expression analysis. How do clustering algorithms work, which ones should we use and what can we expect from them?",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/16333293",
+    "type": "article-journal",
+    "id": "VzZoy0BD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:16333293"
+  },
+  {
+    "title": "A modular analysis framework for blood genomics studies: application to systemic lupus erythematosus.",
+    "volume": "29",
+    "issue": "1",
+    "page": "150-64",
+    "container-title": "Immunity",
+    "container-title-short": "Immunity",
+    "ISSN": "1097-4180",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          7,
+          18
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      },
+      {
+        "given": "Charles",
+        "family": "Quinn"
+      },
+      {
+        "given": "Jing",
+        "family": "Shen"
+      },
+      {
+        "given": "Pinakeen",
+        "family": "Patel"
+      },
+      {
+        "given": "Casey",
+        "family": "Glaser"
+      },
+      {
+        "given": "Nicole",
+        "family": "Baldwin"
+      },
+      {
+        "given": "Dorothee",
+        "family": "Stichweh"
+      },
+      {
+        "given": "Derek",
+        "family": "Blankenship"
+      },
+      {
+        "given": "Lei",
+        "family": "Li"
+      },
+      {
+        "given": "Indira",
+        "family": "Munagala"
+      },
+      {
+        "given": "Lynda",
+        "family": "Bennett"
+      },
+      {
+        "given": "Florence",
+        "family": "Allantaz"
+      },
+      {
+        "given": "Asuncion",
+        "family": "Mejias"
+      },
+      {
+        "given": "Monica",
+        "family": "Ardura"
+      },
+      {
+        "given": "Ellen",
+        "family": "Kaizer"
+      },
+      {
+        "given": "Laurence",
+        "family": "Monnet"
+      },
+      {
+        "given": "Windy",
+        "family": "Allman"
+      },
+      {
+        "given": "Henry",
+        "family": "Randall"
+      },
+      {
+        "given": "Diane",
+        "family": "Johnson"
+      },
+      {
+        "given": "Aimee",
+        "family": "Lanier"
+      },
+      {
+        "given": "Marilynn",
+        "family": "Punaro"
+      },
+      {
+        "given": "Knut M",
+        "family": "Wittkowski"
+      },
+      {
+        "given": "Perrin",
+        "family": "White"
+      },
+      {
+        "given": "Joseph",
+        "family": "Fay"
+      },
+      {
+        "given": "Goran",
+        "family": "Klintmalm"
+      },
+      {
+        "given": "Octavio",
+        "family": "Ramilo"
+      },
+      {
+        "given": "A Karolina",
+        "family": "Palucka"
+      },
+      {
+        "given": "Jacques",
+        "family": "Banchereau"
+      },
+      {
+        "given": "Virginia",
+        "family": "Pascual"
+      }
+    ],
+    "PMID": "18631455",
+    "PMCID": "PMC2727981",
+    "DOI": "10.1016/j.immuni.2008.05.012",
+    "abstract": "The analysis of patient blood transcriptional profiles offers a means to investigate the immunological mechanisms relevant to human diseases on a genome-wide scale. In addition, such studies provide a basis for the discovery of clinically relevant biomarker signatures. We designed a strategy for microarray analysis that is based on the identification of transcriptional modules formed by genes coordinately expressed in multiple disease data sets. Mapping changes in gene expression at the module level generated disease-specific transcriptional fingerprints that provide a stable framework for the visualization and functional interpretation of microarray data. These transcriptional modules were used as a basis for the selection of biomarkers and the development of a multivariate transcriptional indicator of disease progression in patients with systemic lupus erythematosus. Thus, this work describes the implementation and application of a methodology designed to support systems-scale analysis of the human immune system in translational research settings.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/18631455",
+    "type": "article-journal",
+    "id": "S56q1qoc",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:18631455"
+  },
+  {
+    "title": "A large-scale analysis of tissue-specific pathology and gene expression of human disease genes and complexes.",
+    "volume": "105",
+    "issue": "52",
+    "page": "20870-5",
+    "container-title": "Proceedings of the National Academy of Sciences of the United States of America",
+    "container-title-short": "Proc Natl Acad Sci U S A",
+    "ISSN": "1091-6490",
+    "issued": {
+      "date-parts": [
+        [
+          2008,
+          12,
+          22
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Kasper",
+        "family": "Lage"
+      },
+      {
+        "given": "Niclas Tue",
+        "family": "Hansen"
+      },
+      {
+        "given": "E Olof",
+        "family": "Karlberg"
+      },
+      {
+        "given": "Aron C",
+        "family": "Eklund"
+      },
+      {
+        "given": "Francisco S",
+        "family": "Roque"
+      },
+      {
+        "given": "Patricia K",
+        "family": "Donahoe"
+      },
+      {
+        "given": "Zoltan",
+        "family": "Szallasi"
+      },
+      {
+        "given": "Thomas Skøt",
+        "family": "Jensen"
+      },
+      {
+        "given": "Søren",
+        "family": "Brunak"
+      }
+    ],
+    "PMID": "19104045",
+    "PMCID": "PMC2606902",
+    "DOI": "10.1073/pnas.0810772105",
+    "abstract": "Heritable diseases are caused by germ-line mutations that, despite tissuewide presence, often lead to tissue-specific pathology. Here, we make a systematic analysis of the link between tissue-specific gene expression and pathological manifestations in many human diseases and cancers. Diseases were systematically mapped to tissues they affect from disease-relevant literature in PubMed to create a disease-tissue covariation matrix of high-confidence associations of >1,000 diseases to 73 tissues. By retrieving >2,000 known disease genes, and generating 1,500 disease-associated protein complexes, we analyzed the differential expression of a gene or complex involved in a particular disease in the tissues affected by the disease, compared with nonaffected tissues. When this analysis is scaled to all diseases in our dataset, there is a significant tendency for disease genes and complexes to be overexpressed in the normal tissues where defects cause pathology. In contrast, cancer genes and complexes were not overexpressed in the tissues from which the tumors emanate. We specifically identified a complex involved in XY sex reversal that is testis-specific and down-regulated in ovaries. We also identified complexes in Parkinson disease, cardiomyopathies, and muscular dystrophy syndromes that are similarly tissue specific. Our method represents a conceptual scaffold for organism-spanning analyses and reveals an extensive list of tissue-specific draft molecular pathways, both known and unexpected, that might be disrupted in disease.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/19104045",
+    "type": "article-journal",
+    "id": "e0tRKjE5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:19104045"
+  },
+  {
+    "title": "Relaxed purifying selection and possibly high rate of adaptation in primate lineage-specific genes.",
+    "volume": "2",
+    "page": "393-409",
+    "container-title": "Genome biology and evolution",
+    "container-title-short": "Genome Biol Evol",
+    "ISSN": "1759-6653",
+    "issued": {
+      "date-parts": [
+        [
+          2010,
+          7,
+          12
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "James J",
+        "family": "Cai"
+      },
+      {
+        "given": "Dmitri A",
+        "family": "Petrov"
+      }
+    ],
+    "PMID": "20624743",
+    "PMCID": "PMC2997544",
+    "DOI": "10.1093/gbe/evq019",
+    "abstract": "Genes in the same organism vary in the time since their evolutionary origin. Without horizontal gene transfer, young genes are necessarily restricted to a few closely related species, whereas old genes can be broadly distributed across the phylogeny. It has been shown that young genes evolve faster than old genes; however, the evolutionary forces responsible for this pattern remain obscure. Here, we classify human-chimp protein-coding genes into different age classes, according to the breath of their phylogenetic distribution. We estimate the strength of purifying selection and the rate of adaptive selection for genes in different age classes. We find that older genes carry fewer and less frequent nonsynonymous single-nucleotide polymorphisms than younger genes suggesting that older genes experience a stronger purifying selection at the protein-coding level. We infer the distribution of fitness effects of new deleterious mutations and find that older genes have proportionally more slightly deleterious mutations and fewer nearly neutral mutations than younger genes. To investigate the role of adaptive selection of genes in different age classes, we determine the selection coefficient (gamma = 2N(e)s) of genes using the MKPRF approach and estimate the ratio of the rate of adaptive nonsynonymous substitution to synonymous substitution (omega(A)) using the DoFE method. Although the proportion of positively selected genes (gamma > 0) is significantly higher in younger genes, we find no correlation between omega(A) and gene age. Collectively, these results provide strong evidence that younger genes are subject to weaker purifying selection and more tenuous evidence that they also undergo adaptive evolution more frequently.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/20624743",
+    "type": "article-journal",
+    "id": "O0e3EhY6",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:20624743"
+  },
+  {
+    "title": "Niacin in patients with low HDL cholesterol levels receiving intensive statin therapy.",
+    "volume": "365",
+    "issue": "24",
+    "page": "2255-67",
+    "container-title": "The New England journal of medicine",
+    "container-title-short": "N Engl J Med",
+    "ISSN": "1533-4406",
+    "issued": {
+      "date-parts": [
+        [
+          2011,
+          11,
+          15
+        ]
+      ]
+    },
+    "author": [
+      {},
+      {
+        "given": "William E",
+        "family": "Boden"
+      },
+      {
+        "given": "Jeffrey L",
+        "family": "Probstfield"
+      },
+      {
+        "given": "Todd",
+        "family": "Anderson"
+      },
+      {
+        "given": "Bernard R",
+        "family": "Chaitman"
+      },
+      {
+        "given": "Patrice",
+        "family": "Desvignes-Nickens"
+      },
+      {
+        "given": "Kent",
+        "family": "Koprowicz"
+      },
+      {
+        "given": "Ruth",
+        "family": "McBride"
+      },
+      {
+        "given": "Koon",
+        "family": "Teo"
+      },
+      {
+        "given": "William",
+        "family": "Weintraub"
+      }
+    ],
+    "PMID": "22085343",
+    "DOI": "10.1056/nejmoa1107579",
+    "abstract": "In patients with established cardiovascular disease, residual cardiovascular risk persists despite the achievement of target low-density lipoprotein (LDL) cholesterol levels with statin therapy. It is unclear whether extended-release niacin added to simvastatin to raise low levels of high-density lipoprotein (HDL) cholesterol is superior to simvastatin alone in reducing such residual risk.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/22085343",
+    "type": "article-journal",
+    "id": "bRPc66OD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:22085343"
+  },
+  {
+    "title": "Architecture of the human regulatory network derived from ENCODE data.",
+    "volume": "489",
+    "issue": "7414",
+    "page": "91-100",
+    "container-title": "Nature",
+    "container-title-short": "Nature",
+    "ISSN": "1476-4687",
+    "issued": {
+      "date-parts": [
+        [
+          2012,
+          9,
+          6
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Mark B",
+        "family": "Gerstein"
+      },
+      {
+        "given": "Anshul",
+        "family": "Kundaje"
+      },
+      {
+        "given": "Manoj",
+        "family": "Hariharan"
+      },
+      {
+        "given": "Stephen G",
+        "family": "Landt"
+      },
+      {
+        "given": "Koon-Kiu",
+        "family": "Yan"
+      },
+      {
+        "given": "Chao",
+        "family": "Cheng"
+      },
+      {
+        "given": "Xinmeng Jasmine",
+        "family": "Mu"
+      },
+      {
+        "given": "Ekta",
+        "family": "Khurana"
+      },
+      {
+        "given": "Joel",
+        "family": "Rozowsky"
+      },
+      {
+        "given": "Roger",
+        "family": "Alexander"
+      },
+      {
+        "given": "Renqiang",
+        "family": "Min"
+      },
+      {
+        "given": "Pedro",
+        "family": "Alves"
+      },
+      {
+        "given": "Alexej",
+        "family": "Abyzov"
+      },
+      {
+        "given": "Nick",
+        "family": "Addleman"
+      },
+      {
+        "given": "Nitin",
+        "family": "Bhardwaj"
+      },
+      {
+        "given": "Alan P",
+        "family": "Boyle"
+      },
+      {
+        "given": "Philip",
+        "family": "Cayting"
+      },
+      {
+        "given": "Alexandra",
+        "family": "Charos"
+      },
+      {
+        "given": "David Z",
+        "family": "Chen"
+      },
+      {
+        "given": "Yong",
+        "family": "Cheng"
+      },
+      {
+        "given": "Declan",
+        "family": "Clarke"
+      },
+      {
+        "given": "Catharine",
+        "family": "Eastman"
+      },
+      {
+        "given": "Ghia",
+        "family": "Euskirchen"
+      },
+      {
+        "given": "Seth",
+        "family": "Frietze"
+      },
+      {
+        "given": "Yao",
+        "family": "Fu"
+      },
+      {
+        "given": "Jason",
+        "family": "Gertz"
+      },
+      {
+        "given": "Fabian",
+        "family": "Grubert"
+      },
+      {
+        "given": "Arif",
+        "family": "Harmanci"
+      },
+      {
+        "given": "Preti",
+        "family": "Jain"
+      },
+      {
+        "given": "Maya",
+        "family": "Kasowski"
+      },
+      {
+        "given": "Phil",
+        "family": "Lacroute"
+      },
+      {
+        "given": "Jing Jane",
+        "family": "Leng"
+      },
+      {
+        "given": "Jin",
+        "family": "Lian"
+      },
+      {
+        "given": "Hannah",
+        "family": "Monahan"
+      },
+      {
+        "given": "Henriette",
+        "family": "O'Geen"
+      },
+      {
+        "given": "Zhengqing",
+        "family": "Ouyang"
+      },
+      {
+        "given": "E Christopher",
+        "family": "Partridge"
+      },
+      {
+        "given": "Dorrelyn",
+        "family": "Patacsil"
+      },
+      {
+        "given": "Florencia",
+        "family": "Pauli"
+      },
+      {
+        "given": "Debasish",
+        "family": "Raha"
+      },
+      {
+        "given": "Lucia",
+        "family": "Ramirez"
+      },
+      {
+        "given": "Timothy E",
+        "family": "Reddy"
+      },
+      {
+        "given": "Brian",
+        "family": "Reed"
+      },
+      {
+        "given": "Minyi",
+        "family": "Shi"
+      },
+      {
+        "given": "Teri",
+        "family": "Slifer"
+      },
+      {
+        "given": "Jing",
+        "family": "Wang"
+      },
+      {
+        "given": "Linfeng",
+        "family": "Wu"
+      },
+      {
+        "given": "Xinqiong",
+        "family": "Yang"
+      },
+      {
+        "given": "Kevin Y",
+        "family": "Yip"
+      },
+      {
+        "given": "Gili",
+        "family": "Zilberman-Schapira"
+      },
+      {
+        "given": "Serafim",
+        "family": "Batzoglou"
+      },
+      {
+        "given": "Arend",
+        "family": "Sidow"
+      },
+      {
+        "given": "Peggy J",
+        "family": "Farnham"
+      },
+      {
+        "given": "Richard M",
+        "family": "Myers"
+      },
+      {
+        "given": "Sherman M",
+        "family": "Weissman"
+      },
+      {
+        "given": "Michael",
+        "family": "Snyder"
+      }
+    ],
+    "PMID": "22955619",
+    "PMCID": "PMC4154057",
+    "DOI": "10.1038/nature11245",
+    "abstract": "Transcription factors bind in a combinatorial fashion to specify the on-and-off states of genes; the ensemble of these binding events forms a regulatory network, constituting the wiring diagram for a cell. To examine the principles of the human transcriptional regulatory network, we determined the genomic binding information of 119 transcription-related factors in over 450 distinct experiments. We found the combinatorial, co-association of transcription factors to be highly context specific: distinct combinations of factors bind at specific genomic locations. In particular, there are significant differences in the binding proximal and distal to genes. We organized all the transcription factor binding into a hierarchy and integrated it with other genomic information (for example, microRNA regulation), forming a dense meta-network. Factors at different levels have different properties; for instance, top-level transcription factors more strongly influence expression and middle-level ones co-regulate targets to mitigate information-flow bottlenecks. Moreover, these co-regulations give rise to many enriched network motifs (for example, noise-buffering feed-forward loops). Finally, more connected network components are under stronger selection and exhibit a greater degree of allele-specific activity (that is, differential binding to the two parental alleles). The regulatory information obtained in this study will be crucial for interpreting personal genome sequences and understanding basic principles of human biology and disease.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/22955619",
+    "type": "article-journal",
+    "id": "jrAMOJCD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:22955619"
+  },
+  {
+    "title": "The therapeutic role of niacin in dyslipidemia management.",
+    "volume": "19",
+    "issue": "2",
+    "page": "141-58",
+    "container-title": "Journal of cardiovascular pharmacology and therapeutics",
+    "container-title-short": "J Cardiovasc Pharmacol Ther",
+    "ISSN": "1940-4034",
+    "issued": {
+      "date-parts": [
+        [
+          2013,
+          12,
+          20
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "William E",
+        "family": "Boden"
+      },
+      {
+        "given": "Mandeep S",
+        "family": "Sidhu"
+      },
+      {
+        "given": "Peter P",
+        "family": "Toth"
+      }
+    ],
+    "PMID": "24363242",
+    "DOI": "10.1177/1074248413514481",
+    "abstract": "There is abundant epidemiologic evidence to support the independent, inverse relationship between low levels of high-density lipoprotein cholesterol (HDL-C) and incident cardiovascular (CV) risk, the clinical importance of which is underscored by the high prevalence of low HDL-C in populations with coronary heart disease (CHD), with or without elevated levels of low-density lipoprotein cholesterol (LDL-C). The National Cholesterol Education Program recommended that optimal treatment for high-risk patients includes both lowering LDL-C and non-HDL-C to risk stratified levels and raising HDL-C when it is <40 mg/dL, although no target level for the latter lipoprotein was suggested. Niacin is the most powerful agent currently available for raising low levels of HDL-C. It also induces significant reductions in triglycerides, lipoprotein(a), and LDL-C levels while also favorably altering LDL particle size and number. In the Coronary Drug Project, niacin treatment was associated with significant reductions in CV events and long-term mortality, similar to the reductions seen in the statin monotherapy trials. In combination trials, niacin plus a statin or bile acid sequestrant produces additive reductions in CHD morbidity and mortality and promotes regression of coronary atherosclerosis. Recently, 2 clinical outcome trials (Atherothrombosis Intervention in Metabolic Syndrome With Low HDL/High Triglycerides and Impact on Global Health Outcomes [AIM-HIGH] and Second Heart Protection Study [HPS-2 THRIVE]) failed to show a reduction in CV events in patients treated to optimally low levels of LDL-C. Despite favorable effects on HDL-C and triglycerides, these studies did not demonstrate incremental clinical benefit with niacin when added to simvastatin, although notable limitations were identified in each of these trials. Thus, there is insufficient evidence from clinical trials to recommend HDL-targeted therapy for additional event reduction at the present time. 
However, niacin should continue to be used as an adjuvant therapy for reducing atherogenic lipoprotein burden in patients who have not reached their risk stratified LDL-C and non-HDL-C targets. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/24363242",
+    "type": "article-journal",
+    "id": "OOAkmacQ",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:24363242"
+  },
+  {
+    "title": "Democratizing systems immunology with modular transcriptional repertoire analyses.",
+    "volume": "14",
+    "issue": "4",
+    "page": "271-80",
+    "container-title": "Nature reviews. Immunology",
+    "container-title-short": "Nat Rev Immunol",
+    "ISSN": "1474-1741",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          4
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      },
+      {
+        "given": "Nicole",
+        "family": "Baldwin"
+      }
+    ],
+    "PMID": "24662387",
+    "PMCID": "PMC4118927",
+    "DOI": "10.1038/nri3642",
+    "abstract": "Individual elements that constitute the immune system have been characterized over the few past decades, mostly through reductionist approaches. The introduction of large-scale profiling platforms has more recently facilitated the assessment of these elements on a global scale. However, the analysis and the interpretation of such large-scale datasets remains a challenge and a barrier for the wider adoption of systems approaches in immunological and clinical studies. In this Innovation article, we describe an analytical strategy that relies on the a priori determination of co-dependent gene sets for a given biological system. Such modular transcriptional repertoires can in turn be used to simplify the analysis and the interpretation of large-scale datasets, and to design targeted immune fingerprinting assays and web applications that will further facilitate the dissemination of systems approaches in immunology.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/24662387",
+    "type": "article-journal",
+    "id": "f2r8LLIn",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:24662387"
+  },
+  {
+    "title": "High-density lipoproteins in the prevention of cardiovascular disease: changing the paradigm.",
+    "volume": "96",
+    "issue": "1",
+    "page": "48-56",
+    "container-title": "Clinical pharmacology and therapeutics",
+    "container-title-short": "Clin Pharmacol Ther",
+    "ISSN": "1532-6535",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          4,
+          8
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "S",
+        "family": "Tuteja"
+      },
+      {
+        "given": "D J",
+        "family": "Rader"
+      }
+    ],
+    "PMID": "24713591",
+    "DOI": "10.1038/clpt.2014.79",
+    "abstract": "High-density-lipoprotein cholesterol (HDL-C) has been identified in population studies as an independent inverse predictor of cardiovascular events. Although the causal nature of this association has been questioned, HDL and its major protein, apolipoprotein (apo)A1, have been shown to prevent and reverse atherosclerosis in animal models. In addition, HDL and apoA1 have several putatively atheroprotective functions, such as the ability to promote efflux of cholesterol from macrophages in the artery wall, inhibit vascular inflammation, and enhance endothelial function. Therefore, HDL-C and apoA1 have been investigated as therapeutic targets for coronary heart disease. However, recent clinical trials with drugs that raise HDL-C, such as niacin and inhibitors of cholesteryl ester transfer protein, have been disappointing. Here, we review the current state of the science regarding HDL as a therapeutic target. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/24713591",
+    "type": "article-journal",
+    "id": "13meq3Hgt",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:24713591"
+  },
+  {
+    "title": "Effects of extended-release niacin with laropiprant in high-risk patients.",
+    "volume": "371",
+    "issue": "3",
+    "page": "203-12",
+    "container-title": "The New England journal of medicine",
+    "container-title-short": "N Engl J Med",
+    "ISSN": "1533-4406",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          7,
+          17
+        ]
+      ]
+    },
+    "author": [
+      {},
+      {
+        "given": "Martin J",
+        "family": "Landray"
+      },
+      {
+        "given": "Richard",
+        "family": "Haynes"
+      },
+      {
+        "given": "Jemma C",
+        "family": "Hopewell"
+      },
+      {
+        "given": "Sarah",
+        "family": "Parish"
+      },
+      {
+        "given": "Theingi",
+        "family": "Aung"
+      },
+      {
+        "given": "Joseph",
+        "family": "Tomson"
+      },
+      {
+        "given": "Karl",
+        "family": "Wallendszus"
+      },
+      {
+        "given": "Martin",
+        "family": "Craig"
+      },
+      {
+        "given": "Lixin",
+        "family": "Jiang"
+      },
+      {
+        "given": "Rory",
+        "family": "Collins"
+      },
+      {
+        "given": "Jane",
+        "family": "Armitage"
+      }
+    ],
+    "PMID": "25014686",
+    "DOI": "10.1056/nejmoa1300955",
+    "abstract": "Patients with evidence of vascular disease are at increased risk for subsequent vascular events despite effective use of statins to lower the low-density lipoprotein (LDL) cholesterol level. Niacin lowers the LDL cholesterol level and raises the high-density lipoprotein (HDL) cholesterol level, but its clinical efficacy and safety are uncertain.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/25014686",
+    "type": "article-journal",
+    "id": "13ZGxHjQ5",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:25014686"
+  },
+  {
+    "title": "A narrow repertoire of transcriptional modules responsive to pyogenic bacteria is impaired in patients carrying loss-of-function mutations in MYD88 or IRAK4.",
+    "volume": "15",
+    "issue": "12",
+    "page": "1134-42",
+    "container-title": "Nature immunology",
+    "container-title-short": "Nat Immunol",
+    "ISSN": "1529-2916",
+    "issued": {
+      "date-parts": [
+        [
+          2014,
+          10,
+          26
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Laia",
+        "family": "Alsina"
+      },
+      {
+        "given": "Elisabeth",
+        "family": "Israelsson"
+      },
+      {
+        "given": "Matthew C",
+        "family": "Altman"
+      },
+      {
+        "given": "Kristen K",
+        "family": "Dang"
+      },
+      {
+        "given": "Pegah",
+        "family": "Ghandil"
+      },
+      {
+        "given": "Laura",
+        "family": "Israel"
+      },
+      {
+        "given": "Horst",
+        "family": "von Bernuth"
+      },
+      {
+        "given": "Nicole",
+        "family": "Baldwin"
+      },
+      {
+        "given": "Huanying",
+        "family": "Qin"
+      },
+      {
+        "given": "Zongbo",
+        "family": "Jin"
+      },
+      {
+        "given": "Romain",
+        "family": "Banchereau"
+      },
+      {
+        "given": "Esperanza",
+        "family": "Anguiano"
+      },
+      {
+        "given": "Alexei",
+        "family": "Ionan"
+      },
+      {
+        "given": "Laurent",
+        "family": "Abel"
+      },
+      {
+        "given": "Anne",
+        "family": "Puel"
+      },
+      {
+        "given": "Capucine",
+        "family": "Picard"
+      },
+      {
+        "given": "Virginia",
+        "family": "Pascual"
+      },
+      {
+        "given": "Jean Laurent",
+        "family": "Casanova"
+      },
+      {
+        "given": "Damien",
+        "family": "Chaussabel"
+      }
+    ],
+    "PMID": "25344726",
+    "PMCID": "PMC4281021",
+    "DOI": "10.1038/ni.3028",
+    "abstract": "Loss of function of the kinase IRAK4 or the adaptor MyD88 in humans interrupts a pathway critical for pathogen sensing and ignition of inflammation. However, patients with loss-of-function mutations in the genes encoding these factors are, unexpectedly, susceptible to only a limited range of pathogens. We employed a systems approach to investigate transcriptome responses following in vitro exposure of patients' blood to agonists of Toll-like receptors (TLRs) and receptors for interleukin 1 (IL-1Rs) and to whole pathogens. Responses to purified agonists were globally abolished, but variable residual responses were present following exposure to whole pathogens. Further delineation of the latter responses identified a narrow repertoire of transcriptional programs affected by loss of MyD88 function or IRAK4 function. Our work introduces the use of a systems approach for the global assessment of innate immune responses and the characterization of human primary immunodeficiencies. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/25344726",
+    "type": "article-journal",
+    "id": "SjGoBywE",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:25344726"
+  },
+  {
+    "title": "Prognostic value of grip strength: findings from the Prospective Urban Rural Epidemiology (PURE) study.",
+    "volume": "386",
+    "issue": "9990",
+    "page": "266-73",
+    "container-title": "Lancet (London, England)",
+    "container-title-short": "Lancet",
+    "ISSN": "1474-547X",
+    "issued": {
+      "date-parts": [
+        [
+          2015,
+          5,
+          13
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Darryl P",
+        "family": "Leong"
+      },
+      {
+        "given": "Koon K",
+        "family": "Teo"
+      },
+      {
+        "given": "Sumathy",
+        "family": "Rangarajan"
+      },
+      {
+        "given": "Patricio",
+        "family": "Lopez-Jaramillo"
+      },
+      {
+        "given": "Alvaro",
+        "family": "Avezum"
+      },
+      {
+        "given": "Andres",
+        "family": "Orlandini"
+      },
+      {
+        "given": "Pamela",
+        "family": "Seron"
+      },
+      {
+        "given": "Suad H",
+        "family": "Ahmed"
+      },
+      {
+        "given": "Annika",
+        "family": "Rosengren"
+      },
+      {
+        "given": "Roya",
+        "family": "Kelishadi"
+      },
+      {
+        "given": "Omar",
+        "family": "Rahman"
+      },
+      {
+        "given": "Sumathi",
+        "family": "Swaminathan"
+      },
+      {
+        "given": "Romaina",
+        "family": "Iqbal"
+      },
+      {
+        "given": "Rajeev",
+        "family": "Gupta"
+      },
+      {
+        "given": "Scott A",
+        "family": "Lear"
+      },
+      {
+        "given": "Aytekin",
+        "family": "Oguz"
+      },
+      {
+        "given": "Khalid",
+        "family": "Yusoff"
+      },
+      {
+        "given": "Katarzyna",
+        "family": "Zatonska"
+      },
+      {
+        "given": "Jephat",
+        "family": "Chifamba"
+      },
+      {
+        "given": "Ehimario",
+        "family": "Igumbor"
+      },
+      {
+        "given": "Viswanathan",
+        "family": "Mohan"
+      },
+      {
+        "given": "Ranjit Mohan",
+        "family": "Anjana"
+      },
+      {
+        "given": "Hongqiu",
+        "family": "Gu"
+      },
+      {
+        "given": "Wei",
+        "family": "Li"
+      },
+      {
+        "given": "Salim",
+        "family": "Yusuf"
+      },
+      {}
+    ],
+    "PMID": "25982160",
+    "DOI": "10.1016/s0140-6736(14)62000-6",
+    "abstract": "Reduced muscular strength, as measured by grip strength, has been associated with an increased risk of all-cause and cardiovascular mortality. Grip strength is appealing as a simple, quick, and inexpensive means of stratifying an individual's risk of cardiovascular death. However, the prognostic value of grip strength with respect to the number and range of populations and confounders is unknown. The aim of this study was to assess the independent prognostic importance of grip strength measurement in socioculturally and economically diverse countries.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/25982160",
+    "type": "article-journal",
+    "id": "aBVh8zt1",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:25982160"
+  },
+  {
+    "title": "Optimized sgRNA design to maximize activity and minimize off-target effects of CRISPR-Cas9.",
+    "volume": "34",
+    "issue": "2",
+    "page": "184-191",
+    "container-title": "Nature biotechnology",
+    "container-title-short": "Nat Biotechnol",
+    "ISSN": "1546-1696",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          1,
+          18
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "John G",
+        "family": "Doench"
+      },
+      {
+        "given": "Nicolo",
+        "family": "Fusi"
+      },
+      {
+        "given": "Meagan",
+        "family": "Sullender"
+      },
+      {
+        "given": "Mudra",
+        "family": "Hegde"
+      },
+      {
+        "given": "Emma W",
+        "family": "Vaimberg"
+      },
+      {
+        "given": "Katherine F",
+        "family": "Donovan"
+      },
+      {
+        "given": "Ian",
+        "family": "Smith"
+      },
+      {
+        "given": "Zuzana",
+        "family": "Tothova"
+      },
+      {
+        "given": "Craig",
+        "family": "Wilen"
+      },
+      {
+        "given": "Robert",
+        "family": "Orchard"
+      },
+      {
+        "given": "Herbert W",
+        "family": "Virgin"
+      },
+      {
+        "given": "Jennifer",
+        "family": "Listgarten"
+      },
+      {
+        "given": "David E",
+        "family": "Root"
+      }
+    ],
+    "PMID": "26780180",
+    "PMCID": "PMC4744125",
+    "DOI": "10.1038/nbt.3437",
+    "abstract": "CRISPR-Cas9-based genetic screens are a powerful new tool in biology. By simply altering the sequence of the single-guide RNA (sgRNA), one can reprogram Cas9 to target different sites in the genome with relative ease, but the on-target activity and off-target effects of individual sgRNAs can vary widely. Here, we use recently devised sgRNA design rules to create human and mouse genome-wide libraries, perform positive and negative selection screens and observe that the use of these rules produced improved results. Additionally, we profile the off-target activity of thousands of sgRNAs and develop a metric to predict off-target sites. We incorporate these findings from large-scale, empirical data to improve our computational design rules and create optimized sgRNA libraries that maximize on-target activity and minimize off-target effects to enable more effective and efficient genetic screens and genome engineering. ",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/26780180",
+    "type": "article-journal",
+    "id": "vNXTnmxp",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:26780180"
+  },
+  {
+    "title": "Avoiding common pitfalls when clustering biological data.",
+    "volume": "9",
+    "issue": "432",
+    "page": "re6",
+    "container-title": "Science signaling",
+    "container-title-short": "Sci Signal",
+    "ISSN": "1937-9145",
+    "issued": {
+      "date-parts": [
+        [
+          2016,
+          6,
+          14
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Tom",
+        "family": "Ronan"
+      },
+      {
+        "given": "Zhijie",
+        "family": "Qi"
+      },
+      {
+        "given": "Kristen M",
+        "family": "Naegle"
+      }
+    ],
+    "PMID": "27303057",
+    "DOI": "10.1126/scisignal.aad1932",
+    "abstract": "Clustering is an unsupervised learning method, which groups data points based on similarity, and is used to reveal the underlying structure of data. This computational approach is essential to understanding and visualizing the complex data that are acquired in high-throughput multidimensional biological experiments. Clustering enables researchers to make biological inferences for further experiments. Although a powerful technique, inappropriate application can lead biological researchers to waste resources and time in experimental follow-up. We review common pitfalls identified from the published molecular biology literature and present methods to avoid them. Commonly encountered pitfalls relate to the high-dimensional nature of biological data from high-throughput experiments, the failure to consider more than one clustering method for a given problem, and the difficulty in determining whether clustering has produced meaningful results. We present concrete examples of problems and solutions (clustering results) in the form of toy problems and real biological data for these issues. We also discuss ensemble clustering as an easy-to-implement method that enables the exploration of multiple clustering solutions and improves robustness of clustering solutions. Increased awareness of common clustering pitfalls will help researchers avoid overinterpreting or misinterpreting the results and missing valuable insights when clustering biological data.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/27303057",
+    "type": "article-journal",
+    "id": "S7LBsfcF",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:27303057"
+  },
+  {
+    "title": "Assessment of the Role of Niacin in Managing Cardiovascular Disease Outcomes: A Systematic Review and Meta-analysis.",
+    "volume": "2",
+    "issue": "4",
+    "page": "e192224",
+    "container-title": "JAMA network open",
+    "container-title-short": "JAMA Netw Open",
+    "ISSN": "2574-3805",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          4,
+          5
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Elvira",
+        "family": "D'Andrea"
+      },
+      {
+        "given": "Spencer P",
+        "family": "Hey"
+      },
+      {
+        "given": "Cherie L",
+        "family": "Ramirez"
+      },
+      {
+        "given": "Aaron S",
+        "family": "Kesselheim"
+      }
+    ],
+    "PMID": "30977858",
+    "PMCID": "PMC6481429",
+    "DOI": "10.1001/jamanetworkopen.2019.2224",
+    "abstract": "Niacin remains a therapeutic option for patients with cardiovascular disease, but recent studies have called into question the effectiveness of other drugs that increase high-density lipoprotein cholesterol levels.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/30977858",
+    "type": "article-journal",
+    "id": "ZGvG75Bj",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:30977858"
+  },
+  {
+    "title": "Shared and distinct genetic risk factors for childhood-onset and adult-onset asthma: genome-wide and transcriptome-wide studies.",
+    "volume": "7",
+    "issue": "6",
+    "page": "509-522",
+    "container-title": "The Lancet. Respiratory medicine",
+    "container-title-short": "Lancet Respir Med",
+    "ISSN": "2213-2619",
+    "issued": {
+      "date-parts": [
+        [
+          2019,
+          4,
+          27
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Milton",
+        "family": "Pividori"
+      },
+      {
+        "given": "Nathan",
+        "family": "Schoettler"
+      },
+      {
+        "given": "Dan L",
+        "family": "Nicolae"
+      },
+      {
+        "given": "Carole",
+        "family": "Ober"
+      },
+      {
+        "given": "Hae Kyung",
+        "family": "Im"
+      }
+    ],
+    "PMID": "31036433",
+    "PMCID": "PMC6534440",
+    "DOI": "10.1016/s2213-2600(19)30055-4",
+    "abstract": "Childhood-onset and adult-onset asthma differ with respect to severity and comorbidities. Whether they also differ with respect to genetic risk factors has not been previously investigated in large samples. The goals of this study were to identify shared and distinct genetic risk loci for childhood-onset and adult-onset asthma, and to identify the genes that might mediate the effects of associated variation.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/31036433",
+    "type": "article-journal",
+    "id": "zwpq2IXD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:31036433"
+  },
+  {
+    "title": "UTMOST, a single and cross-tissue TWAS (Transcriptome Wide Association Study), reveals new ASD (Autism Spectrum Disorder) associated genes.",
+    "volume": "11",
+    "issue": "1",
+    "page": "256",
+    "container-title": "Translational psychiatry",
+    "container-title-short": "Transl Psychiatry",
+    "ISSN": "2158-3188",
+    "issued": {
+      "date-parts": [
+        [
+          2021,
+          4,
+          30
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "Cristina",
+        "family": "Rodriguez-Fontenla"
+      },
+      {
+        "given": "Angel",
+        "family": "Carracedo"
+      }
+    ],
+    "PMID": "33931583",
+    "PMCID": "PMC8087708",
+    "DOI": "10.1038/s41398-021-01378-8",
+    "abstract": "Autism spectrum disorders (ASD) is a complex neurodevelopmental disorder that may significantly impact on the affected individual's life. Common variation (SNPs) could explain about 50% of ASD heritability. Despite this fact and the large size of the last GWAS meta-analysis, it is believed that hundreds of risk genes in ASD have yet to be discovered. New tools, such as TWAS (Transcriptome Wide Association Studies) which integrate tissue expression and genetic data, are a great approach to identify new ASD susceptibility genes. The main goal of this study is to use UTMOST with the publicly available summary statistics from the largest ASD GWAS meta-analysis as genetic input. In addition, an in silico biological characterization for the novel associated loci was performed. Our results have shown the association of 4 genes at the brain level (CIPC, PINX1, NKX2-2, and PTPRE) and have highlighted the association of NKX2-2, MANBA, ERI1, and MITF at the gastrointestinal level. The gastrointestinal associations are quite relevant given the well-established but unexplored relationship between ASD and gastrointestinal symptoms. Cross-tissue analysis has shown the association of NKX2-2 and BLK. UTMOST-associated genes together with their in silico biological characterization seems to point to different biological mechanisms underlying ASD etiology. Thus, it would not be restricted to brain tissue and it will involve the participation of other body tissues such as the gastrointestinal.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/33931583",
+    "type": "article-journal",
+    "id": "ktVcsmYD",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:33931583"
+  },
+  {
+    "title": "Cluster analysis and display of genome-wide expression patterns.",
+    "volume": "95",
+    "issue": "25",
+    "page": "14863-8",
+    "container-title": "Proceedings of the National Academy of Sciences of the United States of America",
+    "container-title-short": "Proc Natl Acad Sci U S A",
+    "ISSN": "0027-8424",
+    "issued": {
+      "date-parts": [
+        [
+          1998,
+          12,
+          8
+        ]
+      ]
+    },
+    "author": [
+      {
+        "given": "M B",
+        "family": "Eisen"
+      },
+      {
+        "given": "P T",
+        "family": "Spellman"
+      },
+      {
+        "given": "P O",
+        "family": "Brown"
+      },
+      {
+        "given": "D",
+        "family": "Botstein"
+      }
+    ],
+    "PMID": "9843981",
+    "PMCID": "PMC24541",
+    "DOI": "10.1073/pnas.95.25.14863",
+    "abstract": "A system of cluster analysis for genome-wide expression data from DNA microarray hybridization is described that uses standard statistical algorithms to arrange genes according to similarity in pattern of gene expression. The output is displayed graphically, conveying the clustering and the underlying expression data simultaneously in a form intuitive for biologists. We have found in the budding yeast Saccharomyces cerevisiae that clustering gene expression data groups together efficiently genes of known similar function, and we find a similar tendency in human data. Thus patterns seen in genome-wide expression experiments can be interpreted as indications of the status of cellular processes. Also, coexpression of genes of known function with poorly characterized or novel genes may provide a simple means of gaining leads to the functions of many genes for which information is not available currently.",
+    "URL": "https://www.ncbi.nlm.nih.gov/pubmed/9843981",
+    "type": "article-journal",
+    "id": "S4e4WaP3",
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: pubmed:9843981"
+  },
+  {
+    "id": "16RTdMKxI",
+    "type": "book",
+    "note": "original-date: 2019-04-03T09:07:14Z\nThis CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://github.com/EBISPOT/EFO-UKB-mappings",
+    "publisher": "EBISPOT",
+    "source": "GitHub",
+    "title": "Mapping UK Biobank to the Experimental Factor Ontology (EFO)",
+    "URL": "https://github.com/EBISPOT/EFO-UKB-mappings",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "issued": {
+      "date-parts": [
+        [
+          "2022",
+          5,
+          3
+        ]
+      ]
+    }
+  },
+  {
+    "id": "GPHGnFRN",
+    "type": "book",
+    "abstract": "harmonization, liftover, and imputation of summary statistics from GWAS",
+    "genre": "Python",
+    "note": "original-date: 2018-10-26T20:24:35Z\nThis CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://github.com/hakyimlab/summary-gwas-imputation",
+    "publisher": "hakyimlab",
+    "source": "GitHub",
+    "title": "Harmonization and Imputation Overview",
+    "URL": "https://github.com/hakyimlab/summary-gwas-imputation",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "issued": {
+      "date-parts": [
+        [
+          "2022",
+          10,
+          31
+        ]
+      ]
+    }
+  },
+  {
+    "id": "VKYS05n1",
+    "type": "webpage",
+    "abstract": "A BioProject is a collection of biological data related to a single initiative, originating from a single organization or from a consortium. A BioProject record provides users a single place to find links to the diverse data types generated for that project",
+    "title": "Homo sapiens (ID 232177) - BioProject - NCBI",
+    "URL": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA232177",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://www.ncbi.nlm.nih.gov/bioproject/PRJNA232177"
+  },
+  {
+    "id": "11eausmiy",
+    "type": "webpage",
+    "abstract": "A BioProject is a collection of biological data related to a single initiative, originating from a single organization or from a consortium. A BioProject record provides users a single place to find links to the diverse data types generated for that project",
+    "title": "Homo sapiens (ID 258384) - BioProject - NCBI",
+    "URL": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA258384",
+    "accessed": {
+      "date-parts": [
+        [
+          "2022",
+          11,
+          29
+        ]
+      ]
+    },
+    "note": "This CSL Item was generated by Manubot v0.5.2 from its persistent identifier (standard_id).\nstandard_id: url:https://www.ncbi.nlm.nih.gov/bioproject/PRJNA258384"
+  }
+]
diff --git a/tests/manuscripts/phenoplier_full_only_first_para/metadata.yaml b/tests/manuscripts/phenoplier_full_only_first_para/metadata.yaml
new file mode 100644
index 0000000..ea84d03
--- /dev/null
+++ b/tests/manuscripts/phenoplier_full_only_first_para/metadata.yaml
@@ -0,0 +1,134 @@
+---
+title: "Projecting genetic associations through gene expression patterns highlights disease etiology and drug mechanisms"
+date: 2023-09-09  # Defaults to date generated, but can specify like '2022-10-31'.
+keywords:
+  - genetic studies
+  - functional genomics
+  - gene co-expression
+  - therapeutic targets
+  - drug repurposing
+  - clustering of complex traits
+lang: en-US
+authors:
+  - name: Milton Pividori
+    github: miltondp
+    initials: MP
+    orcid: 0000-0002-3035-4403
+    twitter: miltondp
+    mastodon: miltondp
+    mastodon-server: genomic.social
+    email: milton.pividori@cuanschutz.edu
+    affiliations:
+      - Department of Biomedical Informatics, University of Colorado School of Medicine, Aurora, CO 80045, USA
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+    funders:
+      - The Gordon and Betty Moore Foundation GBMF 4552
+      - The National Human Genome Research Institute (R01 HG010067)
+      - The National Human Genome Research Institute (K99HG011898)
+      - The Eunice Kennedy Shriver National Institute of Child Health and Human Development (R01 HD109765)
+  
+  - name: Sumei Lu
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+
+  - name: Binglan Li
+    orcid: 0000-0002-0103-6107
+    affiliations:
+      - Department of Biomedical Data Science, Stanford University, Stanford, CA 94305, USA.
+
+  - name: Chun Su
+    orcid:  0000-0001-6388-8666
+    github: sckinta
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+
+  - name: Matthew E. Johnson
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+
+  - name: Wei-Qi Wei
+    affiliations:
+      - Vanderbilt University Medical Center, Nashville, TN 37232, USA
+
+  - name: Qiping Feng
+    orcid: 0000-0002-6213-793X
+    affiliations:
+      - Vanderbilt University Medical Center, Nashville, TN 37232, USA
+
+  - name: Bahram Namjou
+    affiliations:
+      - Cincinnati Children's Hospital Medical Center, Cincinnati, OH 45229, USA
+
+  - name: Krzysztof Kiryluk
+    orcid: 0000-0002-5047-6715
+    twitter: kirylukk
+    affiliations:
+      - Department of Medicine, Division of Nephrology, Vagelos College of Physicians \& Surgeons, Columbia University, New York, NY 10032, USA
+
+  - name: Iftikhar Kullo
+    affiliations:
+      - Mayo Clinic, Rochester, MN 55905, USA
+
+  - name: Yuan Luo
+    orcid: 0000-0003-0195-7456
+    affiliations:
+      - Northwestern University, Chicago, IL 60611, USA
+
+  - name: Blair D. Sullivan
+    github: bdsullivan
+    orcid: 0000-0001-7720-6208
+    twitter: blairdsullivan
+    affiliations:
+      - Kahlert School of Computing, University of Utah, Salt Lake City, UT 84112, USA
+
+  - name: Benjamin F. Voight
+    orcid: 0000-0002-6205-9994
+    twitter: bvoight28
+    github: bvoight
+    affiliations:
+      - Department of Systems Pharmacology and Translational Therapeutics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+      - Institute for Translational Medicine and Therapeutics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+
+  - name: Carsten Skarke
+    orcid: 0000-0001-5145-3681
+    twitter: CarstenSkarke
+    affiliations:
+      - Institute for Translational Medicine and Therapeutics, Department of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+
+  - name: Marylyn D. Ritchie
+    initials: MDR
+    orcid: 0000-0002-1208-1720
+    twitter: MarylynRitchie
+    email: marylyn@pennmedicine.upenn.edu
+    affiliations:
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA 19104, USA
+
+  - name: Struan F.A. Grant
+    email: grants@chop.edu
+    orcid: 0000-0003-2025-5302
+    twitter: STRUANGRANT
+    affiliations:
+      - Center for Spatial and Functional Genomics, Children's Hospital of Philadelphia, Philadelphia, PA 19104, USA
+      - Division of Endocrinology and Diabetes, Children's Hospital of Philadelphia, Philadelphia, PA, 19104, USA
+      - Division of Human Genetics, Children's Hospital of Philadelphia, Philadelphia, PA, 19104, USA
+      - Department of Pediatrics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA, 19104, USA
+      - Department of Genetics, Perelman School of Medicine, University of Pennsylvania, Philadelphia, PA, 19104, USA
+
+  - name: Casey S. Greene
+    github: cgreene
+    initials: CSG
+    orcid: 0000-0001-8713-9213
+    twitter: GreeneScientist
+    mastodon: greenescientist
+    mastodon-server: genomic.social
+    email: casey.s.greene@cuanschutz.edu
+    affiliations:
+      - Center for Health AI, University of Colorado School of Medicine, Aurora, CO 80045, USA
+      - Department of Biomedical Informatics, University of Colorado School of Medicine, Aurora, CO 80045, USA
+    funders:
+      - The Gordon and Betty Moore Foundation (GBMF 4552)
+      - The National Human Genome Research Institute (R01 HG010067)
+      - The National Cancer Institute (R01 CA237170)
+      - The Eunice Kennedy Shriver National Institute of Child Health and Human Development (R01 HD109765)
+    corresponding: true
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index d2bb2de..da86785 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -152,8 +152,8 @@ def test_unresolved_gets_default_prompt():
 # ---
 
 
-def get_editor():
-    content_dir = MANUSCRIPTS_DIR.resolve(strict=True)
+def get_editor(manuscript_dir=MANUSCRIPTS_DIR):
+    content_dir = manuscript_dir.resolve(strict=True)
     editor = ManuscriptEditor(content_dir)
     assert isinstance(editor, ManuscriptEditor)
     return editor
@@ -283,14 +283,8 @@ def test_revise_entire_manuscript(tmp_path, model):
     Path(__file__).parent / "config_loader_fixtures" / "prompt_propogation"
 )
 
-@pytest.mark.parametrize(
-    "model",
-    [
-        DebuggingManuscriptRevisionModel(),
-    ],
-)
 @mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PROMPT_PROPOGATION_CONFIG_DIR))
-def test_prompts_in_final_result(tmp_path, model):
+def test_prompts_in_final_result(tmp_path):
     """
     Tests that the prompts are making it into the final resulting .md files.
 
@@ -308,8 +302,9 @@ def test_prompts_in_final_result(tmp_path, model):
     """
     me = get_editor()
 
-    model.title = me.title
-    model.keywords = me.keywords
+    model = DebuggingManuscriptRevisionModel(
+        title=me.title, keywords=me.keywords
+    )
 
     output_folder = tmp_path
     assert output_folder.exists()
@@ -341,24 +336,22 @@ def test_prompts_in_final_result(tmp_path, model):
             content = f.read()
             assert files_to_prompts[output_md_file.name].strip() in content
 
-# live GPT version of the test, with a different prompt
+
+# ---------
+# --- live GPT version of the test, with a different prompt
+# ---------
+
+# to save on time/cost, we use a version of the phenoplier manuscript that only
+# contains the first paragraph of each section
+BRIEF_MANUSCRIPTS_DIR = Path(__file__).parent / "manuscripts" / "phenoplier_full_only_first_para"
 
 PROMPT_PROPOGATION_CONFIG_DIR = (
     Path(__file__).parent / "config_loader_fixtures" / "prompt_gpt3_e2e"
 )
 
-@pytest.mark.parametrize(
-    "model",
-    [
-        GPT3CompletionModel(
-            title="Debug Manuscript", keywords=["debug"],
-            model_engine="gpt-3.5-turbo"
-        ),
-    ],
-)
 @pytest.mark.cost
-@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, PROMPT_PROPOGATION_CONFIG_DIR))
-def test_prompts_apply_gpt3(tmp_path, model):
+@mock.patch("builtins.open", mock_unify_open(BRIEF_MANUSCRIPTS_DIR, PROMPT_PROPOGATION_CONFIG_DIR))
+def test_prompts_apply_gpt3(tmp_path):
     """
     Tests that the custom prompts are applied when actually applying
     the prompts to an LLM.
@@ -371,10 +364,12 @@ def test_prompts_apply_gpt3(tmp_path, model):
     As with test_prompts_in_final_result above, files that have no input and 
     thus no applied prompt are ignored.
     """
-    me = get_editor()
+    me = get_editor(manuscript_dir=BRIEF_MANUSCRIPTS_DIR)
 
-    model.title = me.title
-    model.keywords = me.keywords
+    model = GPT3CompletionModel(
+        title="Debug Manuscript", keywords=["debug"],
+        model_engine="gpt-3.5-turbo"
+    )
 
     output_folder = tmp_path
     assert output_folder.exists()

From 7084475945ae84a7cbc09bec082eb534b758a3f6 Mon Sep 17 00:00:00 2001
From: Milton Pividori <miltondp@gmail.com>
Date: Wed, 17 Apr 2024 07:32:22 -0600
Subject: [PATCH 28/44] models: change model_engine to gpt-3.5-turbo

---
 libs/manubot_ai_editor/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index fc90c2b..1c1df24 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -130,7 +130,7 @@ def __init__(
         title: str,
         keywords: list[str],
         openai_api_key: str = None,
-        model_engine: str = "text-davinci-003",
+        model_engine: str = "gpt-3.5-turbo",
         temperature: float = 0.5,
         presence_penalty: float = None,
         frequency_penalty: float = None,

From d672346cf4a4d60811766f95d6be461284b91a76 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 17 Apr 2024 13:47:11 -0600
Subject: [PATCH 29/44] Updated model_basics tests to check for
 "gpt-3.5-turbo", not "text-davinci-003", since that's the new default

---
 tests/test_model_basics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_model_basics.py b/tests/test_model_basics.py
index 0001229..8242ca9 100644
--- a/tests/test_model_basics.py
+++ b/tests/test_model_basics.py
@@ -78,7 +78,7 @@ def test_model_object_init_default_language_model():
         keywords=["test", "keywords"],
     )
 
-    assert model.model_parameters["model"] == "text-davinci-003"
+    assert model.model_parameters["model"] == "gpt-3.5-turbo"
 
 
 @mock.patch.dict("os.environ", {env_vars.LANGUAGE_MODEL: "text-curie-001"})
@@ -98,7 +98,7 @@ def test_model_object_init_read_language_model_from_environment_is_empty():
         keywords=["test", "keywords"],
     )
 
-    assert model.model_parameters["model"] == "text-davinci-003"
+    assert model.model_parameters["model"] == "gpt-3.5-turbo"
 
 
 def test_get_max_tokens_fraction_is_one():

From fa28c19fc9216844185863e181e56066b0ccf111 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:00:40 -0600
Subject: [PATCH 30/44] DebuggingManuscriptRevisionModel() no longer supplies
 default title or keyword args, since we typically want to customize them

---
 libs/manubot_ai_editor/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 1c1df24..e4a53da 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -585,8 +585,8 @@ class DebuggingManuscriptRevisionModel(GPT3CompletionModel):
     name and the resolved prompt.
     """
 
-    def __init__(self, title: str = "debugging-manuscript", keywords: list[str] = None, **kwargs):
-        super().__init__(title, keywords, **kwargs)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
     def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):
         params = self.get_params(paragraph_text, section_name, resolved_prompt)

From 1fb08b8cea4731c28f221debf356f313eace872b Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:04:40 -0600
Subject: [PATCH 31/44] Removes hardcoded front-matter ignore in favor of using
 other ignore mechanisms

---
 libs/manubot_ai_editor/editor.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/libs/manubot_ai_editor/editor.py b/libs/manubot_ai_editor/editor.py
index afbf69a..7f752c4 100644
--- a/libs/manubot_ai_editor/editor.py
+++ b/libs/manubot_ai_editor/editor.py
@@ -462,10 +462,6 @@ def revise_manuscript(
                 filenames_to_revise = None
 
         for filename in sorted(self.content_dir.glob("*.md")):
-            # ignore front-matter file
-            if "front-matter" in filename.name:
-                continue
-
             filename_section = self.get_section_from_filename(filename.name)
 
             # use the ai_revision prompt config to attempt to resolve a prompt

From 6c08714d77324ccd4f82ee8e0c230aec66a6c2b9 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:05:36 -0600
Subject: [PATCH 32/44] Adds test for correct title, keywords in resulting
 prompts.

---
 .../ai_revision-prompts.yaml                  |  2 +-
 .../ai_revision-prompts.yaml                  |  2 +-
 tests/test_prompt_config.py                   | 38 +++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml b/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
index 8c8f9d6..e4f2cf4 100644
--- a/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/both_prompts_config/ai_revision-prompts.yaml
@@ -37,4 +37,4 @@ prompts:
        and the text has a clear sentence structure
 
   default: |
-    Proofread the following paragraph
+    Proofread the following paragraph (with the title '{title}' and keywords '{keywords}')
diff --git a/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml b/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml
index bbb95a4..12bbdbb 100644
--- a/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml
+++ b/tests/config_loader_fixtures/prompt_propogation/ai_revision-prompts.yaml
@@ -1,7 +1,7 @@
 prompts:
   front_matter: This is the front-matter prompt
   abstract: This is the abstract prompt
-  introduction: This is the introduction prompt
+  introduction: "This is the introduction prompt for the paper titled '{title}'."
   results: This is the results prompt
   results_framework: This is the results_framework prompt
   crispr: This is the crispr prompt
diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index da86785..b0b951f 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -275,6 +275,44 @@ def test_revise_entire_manuscript(tmp_path, model):
     output_md_files = list(output_folder.glob("*.md"))
     assert len(output_md_files) == 9
 
+
+@mock.patch("builtins.open", mock_unify_open(MANUSCRIPTS_DIR, BOTH_PROMPTS_CONFIG_DIR))
+def test_revise_entire_manuscript_includes_title_keywords(tmp_path):
+    from os.path import basename
+
+    print(f"\n{str(tmp_path)}\n")
+    me = get_editor()
+
+    model = DebuggingManuscriptRevisionModel(
+        title="Test title", keywords=["test", "keywords"]
+    )
+
+    # ensure overwriting the title and keywords works
+    model.title = me.title
+    model.keywords = me.keywords
+
+    output_folder = tmp_path
+    assert output_folder.exists()
+
+    me.revise_manuscript(output_folder, model)
+
+    # gather up the output files so we can check their contents
+    output_md_files = list(output_folder.glob("*.md"))
+
+    # check that the title and keywords are in the final result
+    # for prompts that include that information
+    for output_md_file in output_md_files:
+        # we expressly skip results because it doesn't contain any revisable
+        # paragraphs
+        if "results" in output_md_file.name:
+            continue
+
+        with open(output_md_file, "r") as f:
+            content = f.read()
+            assert me.title in content, f"not found in filename: {basename(output_md_file)}"
+            assert ", ".join(me.keywords) in content, f"not found in filename: {basename(output_md_file)}"
+
+
 # ==============================================================================
 # === end-to-end tests, to verify that the prompts are making it into the final result
 # ==============================================================================

From 98b174dbab1373638a9d61b3c601fe009987c131 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:08:57 -0600
Subject: [PATCH 33/44] Updates a few DebuggingManuscriptRevisionModel() uses
 that didn't supply title, keywords.

---
 tests/test_prompt_config.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index b0b951f..12aa3fb 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -254,7 +254,9 @@ def test_conflicting_sources_warning(capfd):
     "model",
     [
         RandomManuscriptRevisionModel(),
-        DebuggingManuscriptRevisionModel()
+        DebuggingManuscriptRevisionModel(
+            title="Test title", keywords=["test", "keywords"]
+        )
         # GPT3CompletionModel(None, None),
     ],
 )
@@ -405,8 +407,8 @@ def test_prompts_apply_gpt3(tmp_path):
     me = get_editor(manuscript_dir=BRIEF_MANUSCRIPTS_DIR)
 
     model = GPT3CompletionModel(
-        title="Debug Manuscript", keywords=["debug"],
-        model_engine="gpt-3.5-turbo"
+        title=me.title,
+        keywords=me.keywords
     )
 
     output_folder = tmp_path

From 8cf6ca8456655915bd41888088084241ba281e90 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:09:58 -0600
Subject: [PATCH 34/44] Finishes updating test for title inclusion in
 test_prompts_in_final_result().

---
 tests/test_prompt_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_prompt_config.py b/tests/test_prompt_config.py
index 12aa3fb..f0903f3 100644
--- a/tests/test_prompt_config.py
+++ b/tests/test_prompt_config.py
@@ -353,9 +353,9 @@ def test_prompts_in_final_result(tmp_path):
 
     # mapping of filenames to prompts to check in the result
     files_to_prompts = {
-        "00.front-matter.md": "This is the front-matter prompt",
+        "00.front-matter.md": "This is the front-matter prompt.",
         "01.abstract.md": "This is the abstract prompt",
-        "02.introduction.md": "This is the introduction prompt",
+        "02.introduction.md": "This is the introduction prompt for the paper titled '%s'." % me.title,
         # "04.00.results.md": "This is the results prompt",
         "04.05.00.results_framework.md": "This is the results_framework prompt",
         "04.05.01.crispr.md": "This is the crispr prompt",

From f9f89d17d7cbd4be813ba3e2b7069b3e9420d08c Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:11:51 -0600
Subject: [PATCH 35/44] Updates test that previously discounted front-matter to
 include it, since it's no longer hardcoded as ignored

---
 tests/test_editor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_editor.py b/tests/test_editor.py
index ea0d532..60bb778 100644
--- a/tests/test_editor.py
+++ b/tests/test_editor.py
@@ -1160,7 +1160,7 @@ def test_revise_entire_manuscript_non_standard_filenames_with_custom_prompt(
     me.revise_manuscript(output_folder, model)
 
     output_md_files = list(output_folder.glob("*.md"))
-    assert len(output_md_files) == 5
+    assert len(output_md_files) == 6
 
 
 @mock.patch.dict(

From 8ea6d5180eccac685e033cfd9e26fcb2490d5b5f Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 18 Apr 2024 17:18:33 -0600
Subject: [PATCH 36/44] Factors out replacements/placeholders dicts into a
 shared placeholders dict

---
 libs/manubot_ai_editor/models.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index e4a53da..3d8c5af 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -293,6 +293,16 @@ def get_prompt(
         # 4. finally, if none of the above are true, then a generic prompt is
         #    used.
 
+        # set of options to replace in the prompt text, e.g.
+        # {title} would be replaced with self.title, the title of
+        # the manuscript.
+        placeholders = {
+            "paragraph_text": paragraph_text.strip(),
+            "section_name": section_name,
+            "title": self.title,
+            "keywords": ", ".join(self.keywords),
+        }
+
         custom_prompt = None
         if (c := os.environ.get(env_vars.CUSTOM_PROMPT, "").strip()) and c != "":
             custom_prompt = c
@@ -300,26 +310,13 @@ def get_prompt(
                 f"Using custom prompt from environment variable '{env_vars.CUSTOM_PROMPT}'"
             )
 
-            placeholders = {
-                "paragraph_text": paragraph_text.strip(),
-                "section_name": section_name,
-                "title": self.title,
-                "keywords": ", ".join(self.keywords),
-            }
-
             # FIXME: if {paragraph_text} is in the prompt, this won't work for the edits endpoint
             #  a simple workaround is to remove {paragraph_text} from the prompt
             prompt = custom_prompt.format(**placeholders)
         elif resolved_prompt:
             # use the resolved prompt from the ai_revision config files, if available
             # replace placeholders with their actual values
-            replacements = {
-                "paragraph_text": paragraph_text.strip(),
-                "section_name": section_name,
-                "title": self.title,
-                "keywords": ", ".join(self.keywords),
-            }
-            prompt = resolved_prompt.format(**replacements)
+            prompt = resolved_prompt.format(**placeholders)
         elif section_name in ("abstract",):
             prompt = f"""
                 Revise the following paragraph from the {section_name} of an academic paper (with the title '{self.title}' and keywords '{", ".join(self.keywords)}')

From 3ae73decad5dbb633880440b2b5f0b6051c76320 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 1 May 2024 10:12:07 -0600
Subject: [PATCH 37/44] DebuggingManuscriptRevisionModel provides a default
 title and keywords so that tests that don't specify one will pass

---
 libs/manubot_ai_editor/models.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 3d8c5af..023ff56 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -583,6 +583,11 @@ class DebuggingManuscriptRevisionModel(GPT3CompletionModel):
     """
 
     def __init__(self, *args, **kwargs):
+        if 'title' not in kwargs or kwargs['title'] is None:
+            kwargs['title'] = "Debugging Title"
+        if 'keywords' not in kwargs or kwargs['keywords'] is None:
+            kwargs['keywords'] = ["debugging", "keywords"]
+
         super().__init__(*args, **kwargs)
 
     def revise_paragraph(self, paragraph_text, section_name, resolved_prompt=None):

From 1766a7820345c8da3c672497c16b3a9585c97b7f Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 3 Jul 2024 21:04:46 -0600
Subject: [PATCH 38/44] Adds default prompt from MP that overrides
 section-specific canned prompts with a single default

---
 libs/manubot_ai_editor/models.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 023ff56..92b22a1 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -9,6 +9,26 @@
 
 from manubot_ai_editor import env_vars
 
+# if DEFAULT_PROMPT_OVERRIDE is not None, it's used in get_prompt() instead of the section-specific
+# 'canned' prompts.
+
+# specifically, the prompt resolution order is the following:
+# 1. if a custom prompt is specified via the env var specified by
+#    env_vars.CUSTOM_PROMPT, then the text in that env var is used as the
+#    prompt.
+# 2. if the files ai_revision-config.yaml and/or ai_revision-prompts.yaml are
+#    available, then a prompt resolved from the filename via those config files
+#    is used.
+# 3. if DEFAULT_PROMPT_OVERRIDE (below) is not None, then it is used as the
+#    prompt.
+# 4. if a section_name is specified, then a canned section-specific prompt
+#    matching the section name is used.
+# 5. finally, a generic prompt is constructed and used if none of the above
+#    conditions are met.
+DEFAULT_PROMPT_OVERRIDE = """
+Proofread the following paragraph that is part of a scientific manuscript.
+Keep all Markdown formatting, citations to other articles, mathematical expressions, and equations.
+"""
 
 class ManuscriptRevisionModel(ABC):
     """
@@ -317,6 +337,10 @@ def get_prompt(
             # use the resolved prompt from the ai_revision config files, if available
             # replace placeholders with their actual values
             prompt = resolved_prompt.format(**placeholders)
+        elif DEFAULT_PROMPT_OVERRIDE is not None:
+            # if a default prompt override is specified, use it in favor
+            # of any section-specific prompts
+            prompt = DEFAULT_PROMPT_OVERRIDE.format(**placeholders)
         elif section_name in ("abstract",):
             prompt = f"""
                 Revise the following paragraph from the {section_name} of an academic paper (with the title '{self.title}' and keywords '{", ".join(self.keywords)}')

From c68394a9faf3e68f061d715842e85e6e5dd5fa5e Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Wed, 3 Jul 2024 21:10:13 -0600
Subject: [PATCH 39/44] Adds info to the root README about configuring prompts.
 Adds a separate doc on how to use the custom prompts system.

---
 README.md              |  15 +++++
 docs/custom-prompts.md | 126 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 docs/custom-prompts.md

diff --git a/README.md b/README.md
index 056c930..f41e837 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,21 @@ The Manubot AI Editor can be used from the GitHub repository of a Manubot-based
 You first need to follow the steps to [setup a Manubot-based manuscript](https://github.com/manubot/rootstock).
 Then, follow [these instructions](https://github.com/manubot/rootstock/blob/main/USAGE.md#ai-assisted-authoring) to setup a workflow in GitHub Actions that will allow you to quickly trigger a job to revise your manuscript.
 
+### Configuring Prompts
+
+In order to revise your manuscript, prompts must be provided to the AI model. There are two ways to do this:
+- **Default prompts**: you can use the default prompts provided by the tool, in which case you don't need to do anything.
+- **Custom prompts**: you can define your own prompts to apply to specific files using YAML configuration files that you include with your manuscript.
+
+The default prompt, which should work for most manuscripts, is the following:
+
+```
+Proofread the following paragraph that is part of a scientific manuscript.
+Keep all Markdown formatting, citations to other articles, mathematical expressions, and equations.
+```
+
+If you wish to customize the prompts on a per-file basis, see [docs/custom-prompts.md](docs/custom-prompts.md) for more information.
+
 ### Command line
 
 To use the tool from the command line, you first need to install Manubot in a Python environment:
diff --git a/docs/custom-prompts.md b/docs/custom-prompts.md
new file mode 100644
index 0000000..9ed7261
--- /dev/null
+++ b/docs/custom-prompts.md
@@ -0,0 +1,126 @@
+# Custom Prompts
+
+Rather than using the default prompt, you can specify custom prompts for each file in your manuscript.
+This can be useful when you want specific sections of your manuscript to be revised in specific ways, or not revised at all.
+
+There are two ways that you can use the custom prompts system:
+1. You can define your prompts and how they map to your manuscript files in a single file, `ai_revision-prompts.yaml`.
+2. You can create the `ai_revision-prompts.yaml` file, but only specify prompts and identifiers, which makes it suitable for sharing with others who have different names for their manuscripts' files.
+You would then specify a second file, `ai_revision-config.yaml`, that maps the prompt identifiers to the actual files in your manuscript.
+
+These files should be placed in the `content` directory alongside your manuscript markdown files.
+
+See [Functionality Notes](#functionality-notes) later in this document for more information on how to write regular expressions and use placeholders in your prompts.
+
+## Approach 1: Single file
+
+With this approach, you can define your prompts and how they map to your manuscript files in a single file.
+The single file should be named `ai_revision-prompts.yaml` and placed in the `content` folder.
+
+The file would look something like the following:
+
+```yaml
+prompts_files:
+  # filenames are specified as regular expressions
+  # in this case, we match a file named exactly 'filename.md'
+  ^filename\.md$: "Prompt text here"
+
+  # you can use YAML's multi-line string syntax to write longer prompts
+  # you can also use {placeholders} to include metadata from your manuscript
+  ^another_filename\.md$: |
+    Revise the following paragraph from a manuscript titled {title}
+    so that it sounds like an academic paper.
+
+  # specifying the special value 'null' will skip revising any files that
+  # match this regular expression
+  ^ignore_this_file\.md$: null
+```
+
+Note that, for each file, the first matching regular expression will determine its prompt or whether the file is skipped.
+Even if a file matches multiple regexes, only the first one will be used.
+
+
+## Approach 2: Prompt file plus configuration file
+
+In this case, we specify two files, `ai_revision-prompts.yaml` and `ai_revision-config.yaml`.
+
+The `ai_revision-prompts.yaml` file contains only the prompts and their identifiers.
+The top-level element is `prompts` in this case rather than `prompts_files`, as it defines a set of reusable prompts and not prompt-file mappings.
+
+Here's an example of what the `ai_revision-prompts.yaml` file might look like:
+```yaml
+prompts:
+  intro_prompt: "Prompt text here"
+  content_prompts: |
+    Revise the following paragraph from a manuscript titled {title}
+    so that it sounds like an academic paper.
+
+  my_default: "Revise this paragraph so it sounds nicer."
+```
+
+The `ai_revision-config.yaml` file maps the prompt identifiers to the actual files in your manuscript.
+
+An example of the `ai_revision-config.yaml` file:
+```yaml
+files:
+  matchings:
+    - files:
+        - ^introduction\.md$
+      prompt: intro_prompt
+    - files:
+        - ^methods\.md$
+        - ^abstract\.md$
+      prompt: content_prompts
+
+  # the special value default_prompt is used when no other regex matches
+  # it also uses a prompt identifier taken from ai_revision-prompts.yaml
+  default_prompt: my_default
+
+  # any file you want to be skipped can be specified in this list
+  ignores:
+    - ^ignore_this_file\.md$
+```
+
+Multiple regexes can be specified in a list under `files` to match multiple files to a single prompt.
+
+In this case, the `default_prompt` is used when no other regex matches, and it uses a prompt identifier taken from `ai_revision-prompts.yaml`.
+
+The `ignores` list specifies files that should be skipped entirely during the revision process; they won't have the default prompt applied to them.
+
+
+## Functionality Notes
+
+### Filenames as Regular Expressions
+
+Filenames in either approach are specified as regular expressions (aka "regexes").
+This allows you to flexibly match multiple files to a prompt with a single expression.
+
+A simple example: to specify an exact match for, say, `myfile.md`, you'd supply the regular expression `^myfile\.md$`, where:
+- `^` matches the beginning of the filename
+- `\.` matches a literal period -- otherwise, `.` means "any character"
+- `$` matches the end of the filename
+
+To illustrate why that syntax is important: if you were to write it as `myfile.md`, the `.` would match any character, so it would match `myfileAmd`, `myfile2md`, etc.
+Without the `^` and `$`, it would also match filenames like `asdf_myfile.md`, `myfile.md_asdf`, and `asdf_myfile.md.txt`.
+
+The benefit of using regexes becomes more apparent when you have multiple files.
+For example, say you had three files, `02.prior-work.md`, `02.methods.md`, and `02.results.md`. To match all of these, you could use the expression `^02\..*\.md$`.
+This would match any file beginning with `02.` and ending with `.md`.
+Here, `.` again indicates "any character" and the `*` means "zero or more of the preceding character"; together, they match any sequence of characters.
+
+You can find more information on how to write regular expressions in [Python's `re` module documentation](https://docs.python.org/3/library/re.html#regular-expression-syntax).
+
+
+### Placeholders
+
+The prompt text can include metadata from your manuscript, specified in `content/metadata.yaml` in Manubot. Writing
+`{placeholder}` into your prompt text will cause it to be replaced with the corresponding value, drawn either
+from the manuscript metadata or from the current file/paragraph being revised.
+
+The following placeholders are available:
+- `{title}`: the title of the manuscript, as defined in the metadata
+- `{keywords}`: comma-delimited keywords from the manuscript metadata
+- `{paragraph_text}`: the text from the current paragraph
+- `{section_name}`: the name of the section (e.g., "introduction", "conclusion"), derived from the filename*
+
+*(\* The mechanism that produces `section_name` is out of the scope of this document, but you can find the implementation in [editor.py](https://github.com/falquaddoomi/manubot-ai-editor/blob/issue-31-customprompts-yaml/libs/manubot_ai_editor/editor.py#L178-L211))*

From de8bb1fea9bfaa2c26f9f627b383f189c70c983b Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Thu, 4 Jul 2024 11:49:36 -0600
Subject: [PATCH 40/44] Added skips for section-specific prompt tests when
 DEFAULT_PROMPT_OVERRIDE is specified, added test for when
 DEFAULT_PROMPT_OVERRIDE is specified.

---
 tests/test_model_get_prompt.py | 65 +++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/tests/test_model_get_prompt.py b/tests/test_model_get_prompt.py
index 538e654..30afdba 100644
--- a/tests/test_model_get_prompt.py
+++ b/tests/test_model_get_prompt.py
@@ -1,9 +1,58 @@
 from unittest import mock
 
 from manubot_ai_editor import env_vars
-from manubot_ai_editor.models import GPT3CompletionModel
+from manubot_ai_editor.models import GPT3CompletionModel, DEFAULT_PROMPT_OVERRIDE
+import pytest
 
+# test decorator to skip section-specific prompt tests when
+# DEFAULT_PROMPT_OVERRIDE is set
+default_prompt_override = pytest.mark.skipif(
+    DEFAULT_PROMPT_OVERRIDE is not None, reason="DEFAULT_PROMPT_OVERRIDE is set, so section-specific prompt tests are off"
+)
+
+# ==============================
+# === default prompt override test
+# ==============================
+
+@pytest.mark.skipif(
+    DEFAULT_PROMPT_OVERRIDE is None,
+    reason="DEFAULT_PROMPT_OVERRIDE is not set, so this test is not needed"
+)
+def test_get_prompt_default_prompt_override():
+    """
+    Tests that, when DEFAULT_PROMPT_OVERRIDE is set, the prompt matches
+    the default prompt rather than section-specific prompts.
+
+    Note that we test for the explicit text "Proofread the following paragraph..."
+    rather than just checking that it's equal to DEFAULT_PROMPT_OVERRIDE, since
+    we also want to validate that it's the default prompt MP specified.
+    """
+    
+    model = GPT3CompletionModel(
+        title="Test title",
+        keywords=["test", "keywords"],
+    )
+
+    paragraph_text = """
+This is the first sentence.
+And this is the second sentence.
+Finally, the third sentence.
+    """.strip()
 
+    prompt = model.get_prompt(paragraph_text, "introduction")
+    assert prompt.startswith("Proofread the following paragraph that is part of a scientific manuscript.")
+    assert prompt.endswith(paragraph_text[-20:])
+    assert "  " not in prompt
+
+
+# ==============================
+# === section-specific prompt tests
+# ==============================
+
+# since these all rely on the section-specific default prompt resolver, which is
+# disabled when DEFAULT_PROMPT_OVERRIDE is set, we should skip these tests.
+
+@default_prompt_override
 def test_get_prompt_for_abstract():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -28,6 +77,7 @@ def test_get_prompt_for_abstract():
     assert "  " not in prompt
 
 
+@default_prompt_override
 def test_get_prompt_for_abstract_edit_endpoint():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -58,6 +108,7 @@ def test_get_prompt_for_abstract_edit_endpoint():
     assert paragraph_text.strip() == paragraph
 
 
+@default_prompt_override
 def test_get_prompt_for_introduction():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -82,6 +133,7 @@ def test_get_prompt_for_introduction():
     assert "  " not in prompt
 
 
+@default_prompt_override
 def test_get_prompt_section_is_abstract():
     model = GPT3CompletionModel(
         title="Test title",
@@ -101,6 +153,7 @@ def test_get_prompt_section_is_abstract():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_section_is_introduction():
     model = GPT3CompletionModel(
         title="Test title",
@@ -120,6 +173,7 @@ def test_get_prompt_section_is_introduction():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_section_is_discussion():
     model = GPT3CompletionModel(
         title="Test title",
@@ -139,6 +193,7 @@ def test_get_prompt_section_is_discussion():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_section_is_methods():
     model = GPT3CompletionModel(
         title="Test title",
@@ -156,6 +211,7 @@ def test_get_prompt_section_is_methods():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_section_is_results():
     model = GPT3CompletionModel(
         title="Test title",
@@ -173,6 +229,7 @@ def test_get_prompt_section_is_results():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_not_standard_section():
     model = GPT3CompletionModel(
         title="Test title",
@@ -193,6 +250,7 @@ def test_get_prompt_not_standard_section():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_section_not_provided():
     model = GPT3CompletionModel(
         title="Test title",
@@ -213,6 +271,7 @@ def test_get_prompt_section_not_provided():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+@default_prompt_override
 def test_get_prompt_section_is_none():
     model = GPT3CompletionModel(
         title="Test title",
@@ -230,6 +289,10 @@ def test_get_prompt_section_is_none():
     assert prompt.endswith(paragraph_text[-20:])
 
 
+# ==============================
+# === custom prompt tests
+# ==============================
+
 @mock.patch.dict(
     "os.environ",
     {env_vars.CUSTOM_PROMPT: "proofread and revise the following paragraph"},

From 6a5dacbd40725dd1af412128620903561781416d Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <falquaddoomi@gmail.com>
Date: Fri, 5 Jul 2024 17:56:45 -0600
Subject: [PATCH 41/44] Update docs/custom-prompts.md

Co-authored-by: Milton Pividori <milton.pividori@cuanschutz.edu>
---
 docs/custom-prompts.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/custom-prompts.md b/docs/custom-prompts.md
index 9ed7261..e253de2 100644
--- a/docs/custom-prompts.md
+++ b/docs/custom-prompts.md
@@ -121,6 +121,6 @@ The following placeholders are available:
 - `{title}`: the title of the manuscript, as defined in the metadata
 - `{keywords}`: comma-delimited keywords from the manuscript metadata
 - `{paragraph_text}`: the text from the current paragraph
-- `{section_name}`: the name of the section (e.g., "introduction", "conclusion"), derived from the filename*
+- `{section_name}`: the name of the section (which is one of the following values "abstract",  "introduction", "results", "discussion", "conclusions", "methods" or "supplementary material"), derived from the filename*
 
 *(\* The mechanism that produces `section_name` is out of the scope of this document, but you can find the implementation in [editor.py](https://github.com/falquaddoomi/manubot-ai-editor/blob/issue-31-customprompts-yaml/libs/manubot_ai_editor/editor.py#L178-L211))*

From 8867468abf9dc3da9a98f3293fa016214a85426e Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Fri, 5 Jul 2024 18:08:28 -0600
Subject: [PATCH 42/44] Fleshed out section_name description, removed link to
 personal repo

---
 docs/custom-prompts.md | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/custom-prompts.md b/docs/custom-prompts.md
index e253de2..ba6f453 100644
--- a/docs/custom-prompts.md
+++ b/docs/custom-prompts.md
@@ -121,6 +121,11 @@ The following placeholders are available:
 - `{title}`: the title of the manuscript, as defined in the metadata
 - `{keywords}`: comma-delimited keywords from the manuscript metadata
 - `{paragraph_text}`: the text from the current paragraph
-- `{section_name}`: the name of the section (which is one of the following values "abstract",  "introduction", "results", "discussion", "conclusions", "methods" or "supplementary material"), derived from the filename*
-
-*(\* The mechanism that produces `section_name` is out of the scope of this document, but you can find the implementation in [editor.py](https://github.com/falquaddoomi/manubot-ai-editor/blob/issue-31-customprompts-yaml/libs/manubot_ai_editor/editor.py#L178-L211))*
+- `{section_name}`: the name of the section (which is one of the following values "abstract",  "introduction", "results", "discussion", "conclusions", "methods" or "supplementary material"), derived from the filename.
+
+The `section_name` placeholder works like so:
+- if the env var `AI_EDITOR_FILENAME_SECTION_MAPPING` is specified, it will be interpreted as a dictionary mapping filenames to section names.
+If a key of the dictionary is included in the filename, the value will be used as the section name.
+Also the keys and values can be any string, not just one of the section names mentioned before.
+- If the dict mentioned above is unset or the filename doesn't match any of its keys, the filename will be matched against the following values: "introduction", "methods", "results", "discussion", "conclusions" or "supplementary".
+If the values are contained within the filename, the section name will be mapped to that value. "supplementary" is replaced with "supplementary material", but the others are used as is.

From 31e83632ec6612e9b77538d455537823c551f102 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Mon, 8 Jul 2024 12:17:17 -0600
Subject: [PATCH 43/44] Mock-patches DEFAULT_PROMPT_OVERRIDE to None for tests
 that need it to be None to run, rather than just skipping them if it's not
 none

---
 tests/test_model_get_prompt.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/test_model_get_prompt.py b/tests/test_model_get_prompt.py
index 30afdba..9b02e4a 100644
--- a/tests/test_model_get_prompt.py
+++ b/tests/test_model_get_prompt.py
@@ -10,6 +10,10 @@
     DEFAULT_PROMPT_OVERRIDE is not None, reason="DEFAULT_PROMPT_OVERRIDE is set, so section-specific prompt tests are off"
 )
 
+# test patcher that sets DEFAULT_PROMPT_OVERRIDE to None, so that the
+# tests that are invalidated by it being set can still run
+reset_default_prompt_override = mock.patch("manubot_ai_editor.models.DEFAULT_PROMPT_OVERRIDE", None)
+
 # ==============================
 # === default prompt override test
 # ==============================
@@ -52,7 +56,7 @@ def test_get_prompt_default_prompt_override():
 # since these all rely on the section-specific default prompt resolver, which is
 # disabled when DEFAULT_PROMPT_OVERRIDE is set, we should skip these tests.
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_for_abstract():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -77,7 +81,7 @@ def test_get_prompt_for_abstract():
     assert "  " not in prompt
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_for_abstract_edit_endpoint():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -108,7 +112,7 @@ def test_get_prompt_for_abstract_edit_endpoint():
     assert paragraph_text.strip() == paragraph
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_for_introduction():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -133,7 +137,7 @@ def test_get_prompt_for_introduction():
     assert "  " not in prompt
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_is_abstract():
     model = GPT3CompletionModel(
         title="Test title",
@@ -153,7 +157,7 @@ def test_get_prompt_section_is_abstract():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_is_introduction():
     model = GPT3CompletionModel(
         title="Test title",
@@ -173,7 +177,7 @@ def test_get_prompt_section_is_introduction():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_is_discussion():
     model = GPT3CompletionModel(
         title="Test title",
@@ -193,7 +197,7 @@ def test_get_prompt_section_is_discussion():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_is_methods():
     model = GPT3CompletionModel(
         title="Test title",
@@ -211,7 +215,7 @@ def test_get_prompt_section_is_methods():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_is_results():
     model = GPT3CompletionModel(
         title="Test title",
@@ -229,7 +233,7 @@ def test_get_prompt_section_is_results():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_not_standard_section():
     model = GPT3CompletionModel(
         title="Test title",
@@ -250,7 +254,7 @@ def test_get_prompt_not_standard_section():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_not_provided():
     model = GPT3CompletionModel(
         title="Test title",
@@ -271,7 +275,7 @@ def test_get_prompt_section_not_provided():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@default_prompt_override
+@reset_default_prompt_override
 def test_get_prompt_section_is_none():
     model = GPT3CompletionModel(
         title="Test title",

From 310e9c46d3f3ebe09c3defe806b8767ff3686496 Mon Sep 17 00:00:00 2001
From: Faisal Alquaddoomi <faisal.alquaddoomi@cuanschutz.edu>
Date: Mon, 8 Jul 2024 16:21:16 -0600
Subject: [PATCH 44/44] Removes DEFAULT_PROMPT_OVERRIDE, reverts tests

---
 libs/manubot_ai_editor/models.py | 24 -----------
 tests/test_model_get_prompt.py   | 69 +-------------------------------
 2 files changed, 1 insertion(+), 92 deletions(-)

diff --git a/libs/manubot_ai_editor/models.py b/libs/manubot_ai_editor/models.py
index 92b22a1..023ff56 100644
--- a/libs/manubot_ai_editor/models.py
+++ b/libs/manubot_ai_editor/models.py
@@ -9,26 +9,6 @@
 
 from manubot_ai_editor import env_vars
 
-# if DEFAULT_PROMPT_OVERRIDE is not None, it's used in get_prompt() instead of the section-specific
-# 'canned' prompts.
-
-# specifically, the prompt resolution order is the following:
-# 1. if a custom prompt is specified via the env var specified by
-#    env_vars.CUSTOM_PROMPT, then the text in that env var is used as the
-#    prompt.
-# 2. if the files ai_revision-config.yaml and/or ai_revision-prompt.yaml are
-#    available, then a prompt resolved from the filename via those config files
-#    is used.
-# 3. if DEFAULT_PROMPT_OVERRIDE (below) is not None, then it is used as the
-#    prompt.
-# 4. if a section_name is specified, then a canned section-specific prompt
-#    matching the section name is used.
-# 5. finally, a generic prompt is constructed and used if none of the above
-#    conditions are met.
-DEFAULT_PROMPT_OVERRIDE = """
-Proofread the following paragraph that is part of a scientific manuscript.
-Keep all Markdown formatting, citations to other articles, mathematical expressions, and equations.
-"""
 
 class ManuscriptRevisionModel(ABC):
     """
@@ -337,10 +317,6 @@ def get_prompt(
             # use the resolved prompt from the ai_revision config files, if available
             # replace placeholders with their actual values
             prompt = resolved_prompt.format(**placeholders)
-        elif DEFAULT_PROMPT_OVERRIDE is not None:
-            # if a default prompt override is specified, use it in favor
-            # of any section-specific prompts
-            prompt = DEFAULT_PROMPT_OVERRIDE.format(**placeholders)
         elif section_name in ("abstract",):
             prompt = f"""
                 Revise the following paragraph from the {section_name} of an academic paper (with the title '{self.title}' and keywords '{", ".join(self.keywords)}')
diff --git a/tests/test_model_get_prompt.py b/tests/test_model_get_prompt.py
index 9b02e4a..538e654 100644
--- a/tests/test_model_get_prompt.py
+++ b/tests/test_model_get_prompt.py
@@ -1,62 +1,9 @@
 from unittest import mock
 
 from manubot_ai_editor import env_vars
-from manubot_ai_editor.models import GPT3CompletionModel, DEFAULT_PROMPT_OVERRIDE
-import pytest
+from manubot_ai_editor.models import GPT3CompletionModel
 
-# test decorator to skip section-specific prompt tests when
-# DEFAULT_PROMPT_OVERRIDE is set
-default_prompt_override = pytest.mark.skipif(
-    DEFAULT_PROMPT_OVERRIDE is not None, reason="DEFAULT_PROMPT_OVERRIDE is set, so section-specific prompt tests are off"
-)
-
-# test patcher that sets DEFAULT_PROMPT_OVERRIDE to None, so that the
-# tests that are invalidated by it being set can still run
-reset_default_prompt_override = mock.patch("manubot_ai_editor.models.DEFAULT_PROMPT_OVERRIDE", None)
-
-# ==============================
-# === default prompt override test
-# ==============================
-
-@pytest.mark.skipif(
-    DEFAULT_PROMPT_OVERRIDE is None,
-    reason="DEFAULT_PROMPT_OVERRIDE is not set, so this test is not needed"
-)
-def test_get_prompt_default_prompt_override():
-    """"
-    Tests that, when DEFAULT_PROMPT_OVERRIDE is set, the prompt matches
-    the default prompt rather than section-specific prompts.
-
-    Note that we test for the explicit text "Proofread the following paragraph..."
-    rather than just checking that it's equal to DEFAULT_PROMPT_OVERRIDE, since
-    we also want to validate that it's the default prompt MP specified.
-    """
-    
-    model = GPT3CompletionModel(
-        title="Test title",
-        keywords=["test", "keywords"],
-    )
 
-    paragraph_text = """
-This is the first sentence.
-And this is the second sentence.
-Finally, the third sentence.
-    """.strip()
-
-    prompt = model.get_prompt(paragraph_text, "introduction")
-    assert prompt.startswith("Proofread the following paragraph that is part of a scientific manuscript.")
-    assert prompt.endswith(paragraph_text[-20:])
-    assert "  " not in prompt
-
-
-# ==============================
-# === section-specific prompt tests
-# ==============================
-
-# since these all rely on the section-specific default prompt resolver, which is
-# disabled when DEFAULT_PROMPT_OVERRIDE is set, we should skip these tests.
-
-@reset_default_prompt_override
 def test_get_prompt_for_abstract():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -81,7 +28,6 @@ def test_get_prompt_for_abstract():
     assert "  " not in prompt
 
 
-@reset_default_prompt_override
 def test_get_prompt_for_abstract_edit_endpoint():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -112,7 +58,6 @@ def test_get_prompt_for_abstract_edit_endpoint():
     assert paragraph_text.strip() == paragraph
 
 
-@reset_default_prompt_override
 def test_get_prompt_for_introduction():
     manuscript_title = "Title of the manuscript to be revised"
     manuscript_keywords = ["keyword0", "keyword1", "keyword2"]
@@ -137,7 +82,6 @@ def test_get_prompt_for_introduction():
     assert "  " not in prompt
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_is_abstract():
     model = GPT3CompletionModel(
         title="Test title",
@@ -157,7 +101,6 @@ def test_get_prompt_section_is_abstract():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_is_introduction():
     model = GPT3CompletionModel(
         title="Test title",
@@ -177,7 +120,6 @@ def test_get_prompt_section_is_introduction():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_is_discussion():
     model = GPT3CompletionModel(
         title="Test title",
@@ -197,7 +139,6 @@ def test_get_prompt_section_is_discussion():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_is_methods():
     model = GPT3CompletionModel(
         title="Test title",
@@ -215,7 +156,6 @@ def test_get_prompt_section_is_methods():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_is_results():
     model = GPT3CompletionModel(
         title="Test title",
@@ -233,7 +173,6 @@ def test_get_prompt_section_is_results():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_not_standard_section():
     model = GPT3CompletionModel(
         title="Test title",
@@ -254,7 +193,6 @@ def test_get_prompt_not_standard_section():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_not_provided():
     model = GPT3CompletionModel(
         title="Test title",
@@ -275,7 +213,6 @@ def test_get_prompt_section_not_provided():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-@reset_default_prompt_override
 def test_get_prompt_section_is_none():
     model = GPT3CompletionModel(
         title="Test title",
@@ -293,10 +230,6 @@ def test_get_prompt_section_is_none():
     assert prompt.endswith(paragraph_text[-20:])
 
 
-# ==============================
-# === custom prompt tests
-# ==============================
-
 @mock.patch.dict(
     "os.environ",
     {env_vars.CUSTOM_PROMPT: "proofread and revise the following paragraph"},