"""
Creates the overview markdown files.
"""
import bibtexparser
import os
from typing import TextIO
from dev.markdown import entry_to_markdown
import dev.names as names
def make_overview(document_structure: dict, grouped_entries: dict,
                  title: str, outfile: str):
    """
    Creates an overview file formatted as markdown.

    The `document_structure` dictates the structure of the document.
    It is a dictionary of categories mapping to their respective subcategory
    structures.
    A given category can be a tuple. In this case, all entries corresponding
    to any of the listed groups will be included in the section. The first
    entry in the tuple is used for naming the section.
    If the subcategory structure is empty, None, or False,
    the section will not further divide the category's entries into
    subsections.

    The `grouped_entries` is a dictionary where the keys are the categories
    mapping onto the entries that belong to each category.

    The `title` is the title of the document. It is printed on the first
    line as the main heading.

    The `outfile` is the file to which the output is written.
    """
    with open(outfile, 'w') as out:
        out.write(f'# {title}\n\n')
        out.write('<!-- This file is automatically generated. Do not edit. -->\n\n')
        _make_toc(document_structure, 0, out)
        out.write('\n')
        _write_section(document_structure, grouped_entries, 2, out)
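# A minimal, illustrative sketch (not part of the original data) of the shapes
# `make_overview` expects. The category names and entry fields below are
# hypothetical and only demonstrate the structure:
#
#   example_structure = {
#       'example-topic': {
#           ('example-sub', 'example-sub-alias'): {},
#       },
#   }
#   example_entries = {
#       'example-sub': [{'ID': 'doe2020', 'year': '2020'}],
#   }
#   make_overview(example_structure, example_entries,
#                 'Example overview', 'overview/example.md')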
def _make_toc(doc_structure: dict, nesting_depth: int, out: TextIO):
    nesting_prefix = ' ' * nesting_depth
    for section, sub_sections in doc_structure.items():
        if not isinstance(section, tuple):
            section = (section,)
        section_name = names.translation.get(section[0], section[0])
        toc_link = _github_toc_link(section_name)
        out.write(f'{nesting_prefix}* [{section_name}](#{toc_link})\n')
        if sub_sections:
            _make_toc(sub_sections, nesting_depth + 1, out)
    ...
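# Illustrative only: for a structure such as {'sat': {'sat-solving': []}},
# _make_toc writes a nested bullet list along the lines of
#   * [SAT](#sat)
#    * [SAT Solving](#sat-solving)
# assuming (hypothetically) that names.translation maps 'sat' to 'SAT' and
# 'sat-solving' to 'SAT Solving'.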
def _github_toc_link(section_name: str) -> str:
    """
    Turns the string to lower case, replaces spaces with dashes,
    and removes all remaining characters that are not alphanumeric or a dash.
    """
    section_name = section_name.lower().replace(' ', '-')
    section_name = ''.join(c for c in section_name if c.isalnum() or c == '-')
    return section_name
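# For example (illustrative only): _github_toc_link('SAT Solving') yields
# 'sat-solving', and _github_toc_link('FAQ & Misc.') yields 'faq--misc'.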
def _write_section(section_structure: dict, grouped_entries: dict,
                   heading_level: int, out: TextIO):
    heading_prefix = '#' * heading_level
    for subsection, sub_structure in section_structure.items():
        # A section can be a tuple or a string; we unify this to all tuples.
        if not isinstance(subsection, tuple):
            subsection = (subsection,)
        # The first entry in the tuple is the name of the section.
        section_name = names.translation.get(subsection[0], subsection[0])
        out.write(f'{heading_prefix} {section_name}\n\n')
        if sub_structure:
            _write_section(sub_structure, grouped_entries,
                           heading_level + 1, out)
        else:
            entries = []
            for category in subsection:
                entries += grouped_entries.get(category, [])
            # Sort by year (descending) and then by ID (ascending).
            # Negative year forces descending order for first key.
            entries.sort(key=lambda e: (-int(e['year']), e['ID']))
            for entry in entries:
                out.write('* ' + entry_to_markdown(entry))
                out.write('\n')
            out.write('\n')
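# Illustrative only: a leaf section at the top level of the structure, say
# ('sat-solving',), with two matching entries would roughly render as
#   ## SAT Solving
#
#   * <2023 entry rendered by entry_to_markdown>
#   * <2019 entry rendered by entry_to_markdown>
# with newer publication years listed first (the heading text assumes a
# hypothetical names.translation mapping).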
if __name__ == '__main__':
    with open('database.bib') as f:
        bib = bibtexparser.load(f)

    # Group the entries by the groups they are tagged with.
    group_entries = {}
    for entry in bib.entries:
        groups = entry.get('groups', '').split(',')
        for group in groups:
            group = group.strip()
            group_entries.setdefault(group, []).append(entry)
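    # Illustrative only: an entry whose BibTeX 'groups' field reads, e.g.,
    # 'sat-solving, dataset-sat' is added to both the 'sat-solving' and the
    # 'dataset-sat' buckets (the exact group names in database.bib are
    # assumptions here).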
    # Group the entries by year.
    year_entries = {}
    for entry in bib.entries:
        year = entry['year']
        year_entries.setdefault(year, []).append(entry)
    ai_groups = {
        ('trees', 'ai-mul-trees'): {},
        ('randomforest', 'ai-mul-randomforest'): {},
        ('svm', 'ai-mul-svm'): {},
        ('knn', 'ai-mul-knn'): {},
        'ai-mul-lr': {},
        ('neuralnetworks', 'ai-mul-neuralnetworks'): {},
        'reinforcement-learning': {},
        ('genetic', 'ai-mul-genetic'): {},
        ('nlp', 'ai-mul-llm'): {},
        'automatonlearning': {},
        'baysianinference': {},
        ('clustering', 'ai-mul-clustering'): {},
        ('datamining', 'ai-mul-datamining'): {},
        'ai-mul-naive': {},
        # 'ai-multiple': {},
        'ai-custom': {},
        'ai-other': {},
    }
    fm_groups = {
        'sat': {
            'sat-prediction': [],
            ('sat-solving', 'sat-multi-solving'): [],
            ('sat-portfolio', 'sat-multi-algorithmselection'): [],
            ('sat-maxsat', 'sat-multi-maxsat'): [],
            'sat-varselection': [],
            'sat-branching': [],
            'sat-generation': [],
            'sat-parameter': [],
            'sat-multi-modelcounting': [],
            'sat-dependency': [],
            'sat-meta': [],
        },
        'smt': {
            'smt-solverselection': [],
            'smt-quantifier': [],
            'smt-quality': [],
        },
        'tp': {
            'tp-portfolio': [],
            ('tp-tacticsprediction', 'tp-mul-tacticsprediction'): [],
            'tp-formulaclassification': [],
            ('tp-axiomselection', 'tp-mul-axiomselection'): [],
            ('tp-proofsearch', 'tp-mul-proofsearch'): [],
            'tp-proofmining': [],
            'tp-mul-proofrewrite': [],
            ('tp-proofsynthesis', 'tp-mul-synthesis'): [],
            'tp-formulasynthesis': [],
            'tp-symbolic': [],
            'tp-mul-symbolguessing': [],
            'tp-heuristicselection': [],
            'tp-mul-positionprediction': [],
            'tp-lemmaname': [],
        },
        'modelchecking': [],
        'synthesis': {
            'synthesis-invariant': [],
            'synthesis-loopinvariant': [],
            'synthesis-repair': [],
            'synthesis-specification': [],
            'synthesis-annotations': [],
        },
        'other': [],
    }
    os.makedirs('overview', exist_ok=True)

    make_overview(ai_groups, group_entries,
                  'Overview of used AI techniques from 2019-2023',
                  'overview/ai-techniques_2019-2023.md')
    make_overview(fm_groups, group_entries,
                  'Overview of used FM techniques from 2019-2023',
                  'overview/fm-techniques_2019-2023.md')

    # List the years from newest to oldest, keeping only those that actually
    # occur in the database.
    year_overview = {y_str: [] for y in range(2023, 1971, -1)
                     if (y_str := str(y)) in year_entries}
    make_overview(year_overview, year_entries,
                  'Overview of found primary studies by year',
                  'overview/all_by_year.md')

    data_sets = {
        'dataset-sat': [],
        'dataset-tp': [],
        'dataset-modelchecking': [],
        'dataset-synthesis': [],
        'dataset-smt': [],
    }
    make_overview(data_sets, group_entries,
                  'Overview of found data sets for ML applied to FM '
                  'from 2019-2023',
                  'overview/data-sets_2019-2023.md')