diff --git a/README.md b/README.md index 60e9522..7753121 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,3 @@ # HDMF-AI - an HDMF schema and API for AI/ML workflows -![Schema](schema.png) +![Schema](paper/schema.png) diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..d9f551d --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,124 @@ +@inproceedings{tritt2019hdmf, + title={{HDMF}: hierarchical data modeling framework for modern science data standards}, + author={Tritt, Andrew J and R{\"u}bel, Oliver and Dichter, Benjamin and Ly, Ryan and Kang, Donghe and Chang, Edward F and Frank, Loren M and Bouchard, Kristofer}, + booktitle={2019 IEEE International Conference on Big Data (Big Data)}, + pages={165--179}, + year={2019}, + organization={IEEE}, + doi={10.1109/BigData47090.2019.9005648} +} + +@article{belthangady2019applications, + title={Applications, promises, and pitfalls of deep learning for fluorescence image reconstruction}, + author={Belthangady, Chinmay and Royer, Loic A}, + journal={Nature Methods}, + volume={16}, + number={12}, + pages={1215--1225}, + year={2019}, + publisher={Nature Publishing Group}, + doi={10.1038/s41592-019-0458-z} +} + +@article{wilkinson2016fair, + title={The {FAIR Guiding Principles} for scientific data management and stewardship}, + author={Wilkinson, Mark D and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and others}, + journal={Scientific Data}, + volume={3}, + number={1}, + pages={1--9}, + year={2016}, + publisher={Nature Publishing Group}, + doi={10.1038/sdata.2016.18} +} + +@article{rubel2022neurodata, + title={The {Neurodata Without Borders} ecosystem for neurophysiological data science}, + author={R{\"u}bel, Oliver and Tritt, Andrew and Ly, Ryan and Dichter, Benjamin K and Ghosh, Satrajit and Niu, Lawrence and Baker, Pamela and Soltesz, 
Ivan and Ng, Lydia and Svoboda, Karel and others}, + journal={eLife}, + volume={11}, + pages={e78362}, + year={2022}, + publisher={eLife Sciences Publications Limited}, + doi={10.7554/eLife.78362} +} + +@article{huerta2023fair, + title={{FAIR} for {AI}: An interdisciplinary and international community building perspective}, + author={Huerta, E A and Blaiszik, Ben and Brinson, L Catherine and Bouchard, Kristofer E and Diaz, Daniel and Doglioni, Caterina and Duarte, Javier M and Emani, Murali and Foster, Ian and Fox, Geoffrey and others}, + journal={Scientific Data}, + volume={10}, + number={1}, + pages={487}, + year={2023}, + publisher={Nature Publishing Group} +} + +@article{goble2020fair, + title={{FAIR} computational workflows}, + author={Goble, Carole and Cohen-Boulakia, Sarah and Soiland-Reyes, Stian and Garijo, Daniel and Gil, Yolanda and Crusoe, Michael R and Peters, Kristian and Schober, Daniel}, + journal={Data Intelligence}, + volume={2}, + number={1--2}, + pages={108--121}, + year={2020}, + publisher={MIT Press} +} + +@misc{souza2019provenance, + title={Provenance Data in the Machine Learning Lifecycle in Computational Science and Engineering}, + author={Renan Souza and Leonardo Azevedo and Vítor Lourenço and Elton Soares and Raphael Thiago and Rafael Brandão and Daniel Civitarese and Emilio Vital Brazil and Marcio Moreno and Patrick Valduriez and Marta Mattoso and Renato Cerqueira and Marco A. S. 
Netto}, + year={2019}, + eprint={1910.04223}, + archivePrefix={arXiv}, + primaryClass={cs.DC} +} + +@software{hdf5, + author = {{The HDF Group}}, + title = {{Hierarchical Data Format, version 5}}, + url = {https://github.com/HDFGroup/hdf5} +} + +@software{zarr, + author = {Alistair Miles and + jakirkham and + M Bussonnier and + Josh Moore and + Dimitri Papadopoulos Orfanos and + James Bourbeau and + Andrew Fulton and + Davis Bennett and + Gregory Lee and + Sanket Verma and + Zain Patel and + Ryan Abernathey and + David Stansby and + Mads R. B. Kristensen and + Matthew Rocklin and + AWA BRANDON AWA and + Joe Hamman and + Saransh Chopra and + Elliott Sales de Andrade and + Martin Durant and + Vincent Schut and + raphael dussin and + Juan Nunez-Iglesias and + Chris Barnes and + Shivank Chaudhary and + shikharsg and + hailiangzhang and + Weddy Gikunda}, + title = {zarr-developers/zarr-python: v2.17.1}, + year = 2024, + publisher = {Zenodo}, + doi = {10.5281/zenodo.3773449}, + url = {https://doi.org/10.5281/zenodo.3773449} +} + +@software{Tritt_deep-taxon, + author = {Tritt, Andrew J}, + license = {BSD-3-Clause-LBNL}, + title = {{deep-taxon}}, + url = {https://github.com/exabiome/deep-taxon} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..f2cf9e5 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,63 @@ +--- +title: 'HDMF-AI: A schema and API for storing the results from AI/ML workflows' +tags: + - artificial intelligence + - machine learning + - data standards + - data management + - data modeling + - deep learning + - scientific data + - scientific machine learning + +authors: + - name: Ryan Ly + orcid: 0000-0001-9238-0642 + affiliation: 1 + corresponding: true + - name: Andrew Tritt + orcid: 0000-0002-1617-449X + affiliation: 2 + - name: Marcin Joachimiak + orcid: 0000-0001-8175-045X + affiliation: 3 + - name: Kris Bouchard + orcid: 0000-0002-1974-4603 + affiliation: "1, 4, 5, 6" +affiliations: + - name: Scientific Data Division, 
Lawrence Berkeley National Laboratory, USA + index: 1 + - name: Applied Mathematics and Computational Research Division, Lawrence Berkeley National Laboratory, USA + index: 2 + - name: Biosystems Data Science Department, Environmental Genomics and Systems Biology Division, Lawrence Berkeley National Laboratory, USA + index: 3 + - name: Biological Systems & Engineering Division, Lawrence Berkeley National Laboratory, USA + index: 4 + - name: Helen Wills Neuroscience Institute, UC Berkeley, USA + index: 5 + - name: Redwood Center for Theoretical Neuroscience, UC Berkeley, USA + index: 6 +date: 11 April 2024 +bibliography: paper.bib + +--- + +# Summary + +Scientists are increasingly using artificial intelligence (AI) methods that learn directly from data to make new discoveries in complex systems across multiple domains. However, the lack of standardized data and models in the scientific community hinders the reproducibility and reusability of these methods and their results [@huerta2023fair; @goble2020fair]. Here, we present `HDMF-AI`, a schema and API for storing the common results of AI algorithms in a standardized way. `HDMF-AI` is designed to be flexible and extensible, allowing users to store a range of AI results. These results can be directly linked to the model training data, which enables greater understanding of how models solved the task and more comprehensive analysis of errors. `HDMF-AI` provides users with a convenient programming interface for reading and writing AI results, with powerful options to optimize storage space and data transfer. By using `HDMF-AI`, scientists can easily make their results available and share them with others, helping to ensure that their work is reproducible and reusable. + +# Statement of Need + +Modern AI approaches, such as deep learning, are powerful at uncovering subtle structure in complex datasets that is informative for solving a task. 
These approaches can also discover structures that may be scientifically artefactual [@belthangady2019applications]. For example, there may be relationships between the data acquisition protocols and the collected data, and deep learning could potentially utilize such "nuisance variables" when solving the task. Thus, to trust the results of AI algorithms, we must understand what data features/samples a trained model is utilizing to solve the task, and link that to metadata about those samples to interpret and evaluate the basis of results. Many solutions exist for provenance tracking of AI/ML workflows, e.g., [@souza2019provenance]; however, these solutions are designed for production settings and are difficult to use in exploratory analysis. Although many scientific communities have standardized formats for sharing self-describing data, the AI community has no standard format that connects data and models. The adoption of AI by scientists hinges on making data, models, and workflows FAIR (Findable, Accessible, Interoperable, Reusable) [@wilkinson2016fair] and cross-referenceable to each other to maximize interpretability, reproducibility, and reusability. + +`HDMF-AI` is a schema and Python API for storing the common results of AI algorithms in a standardized way within the Hierarchical Data Modeling Framework (HDMF) [@tritt2019hdmf]. `HDMF-AI` is designed to be flexible and extensible, allowing users to store a range of AI and machine learning results and metadata, such as from classification, regression, and clustering. These results are stored in the `ResultsTable` data type, which extends the `DynamicTable` data type within the base HDMF schema. The `DynamicTable` schema supports simple tabular data as well as more complex structures common in scientific data, such as ragged arrays, n-dimensional arrays, and enumerations. 
The `ResultsTable` schema represents each data sample as a row and includes columns for storing model outputs and information about the AI/ML workflow, such as which data were used for training, validation, and testing. These columns are represented as new data types in the schema to allow extension and composition in other data types (see \autoref{fig:schema}). By extending `DynamicTable`, the `ResultsTable` allows the user to add arbitrary columns, enabling the storage of non-standardized metadata and AI outputs, such as performance metrics, alongside the standardized columns. The `ResultsTable` schema also supports a direct link to data stored in another `DynamicTable`, enabling the user to associate AI results with the original data. This link allows for greater understanding of how models are completing the task and analysis of any associated errors. The schema also supports the storage of model parameters and links to the source code(s) used to train the model, as well as links to publicly available pre-trained models if they were used. + +Using the HDMF API, the `ResultsTable` can easily be added to datasets that follow an HDMF-based standard, such as Neurodata Without Borders [@rubel2022neurodata], a popular data standard for neurophysiology, and HDMF-Seq, a format for storing taxonomic and genomic sequence data [@Tritt_deep-taxon]. HDMF provides core functionality that allows `HDMF-AI` users to store AI results using advanced features and options for efficient storage and access, such as chunking, compression, and selective streaming from an S3 bucket. Users can write results to an HDF5 file, a popular file format for scientific data and high-performance computing [@hdf5], or a Zarr store, a new format optimized for cloud computing [@zarr]. By leveraging existing HDMF tools and standards, `HDMF-AI` provides a scalable and extensible framework for storing AI results in an accessible, standardized way that is compatible with other HDMF-based data formats. 
By enabling standardized co-storage of data and AI results, `HDMF-AI` may enhance the reproducibility and explainability of AI for science. + +![UML diagram of the HDMF-AI schema. Data types with orange headers are introduced by HDMF-AI. Data types with blue headers are defined in HDMF. Fields colored in gray are optional.\label{fig:schema}](schema.png) + +# Acknowledgements + +This work is part of the ENDURABLE project supported by the Advanced Scientific Computing Research (ASCR) program in the U.S. Department of Energy, Office of Science, Office of Biological and Environmental Research (BER) [DE-AC02-05CH11231 to LBNL]. + +# References diff --git a/schema.png b/paper/schema.png similarity index 100% rename from schema.png rename to paper/schema.png