diff --git a/main/404.html b/main/404.html
index cbdf37d..f4fcbf0 100644
--- a/main/404.html
+++ b/main/404.html
@@ -931,6 +931,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="/valence-labs/openQDC/main/API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="/valence-labs/openQDC/main/API/datasets/alchemy.html" class="md-nav__link">
         
@@ -994,6 +1015,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="/valence-labs/openQDC/main/API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="/valence-labs/openQDC/main/API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/basedataset.html b/main/API/basedataset.html
index 178b9f5..64cc3df 100644
--- a/main/API/basedataset.html
+++ b/main/API/basedataset.html
@@ -1200,6 +1200,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1263,6 +1284,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/3bpa.html b/main/API/datasets/3bpa.html
new file mode 100644
index 0000000..db660a1
--- /dev/null
+++ b/main/API/datasets/3bpa.html
@@ -0,0 +1,2271 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Harness the power of quantum chemistry in one line of code.">
+      
+      
+      
+        <link rel="canonical" href="https://github.com/valence-labs/openQDC/main/API/datasets/3bpa.html">
+      
+      
+        <link rel="prev" href="../formats.html">
+      
+      
+        <link rel="next" href="alchemy.html">
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.33">
+    
+    
+      
+        <title>3BPA - OpenQDC</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.3cba04c6.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+      <link rel="stylesheet" href="../../css/custom.css">
+    
+      <link rel="stylesheet" href="../../css/custom-openqdc.css">
+    
+    <!-- Storage helpers. __md_scope: URL obtained by resolving "../.." against the current location. __md_hash: folds the character codes of a string into an integer. __md_get / __md_set: read and write JSON values in a storage object (localStorage by default) under the key scope.pathname + "." + name; __md_set silently ignores errors thrown by setItem. --><script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+  
+
+
+  
+  
+
+<script id="__analytics">function __md_analytics(){function n(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],n("js",new Date),n("config","G-3ESBDCXFTZ"),document.addEventListener("DOMContentLoaded",function(){document.forms.search&&document.forms.search.query.addEventListener("blur",function(){this.value&&n("event","search",{search_term:this.value})}),document$.subscribe(function(){var a=document.forms.feedback;if(void 0!==a)for(var e of a.querySelectorAll("[type=submit]"))e.addEventListener("click",function(e){e.preventDefault();var t=document.location.pathname,e=this.getAttribute("data-md-value");n("event","feedback",{page:t,data:e}),a.firstElementChild.disabled=!0;e=a.querySelector(".md-feedback__note [data-md-value='"+e+"']");e&&(e.hidden=!1)}),a.hidden=!1}),location$.subscribe(function(e){n("config","G-3ESBDCXFTZ",{page_path:e.pathname})})});var e=document.createElement("script");e.async=!0,e.src="https://www.googletagmanager.com/gtag/js?id=G-3ESBDCXFTZ",document.getElementById("__analytics").insertAdjacentElement("afterEnd",e)}</script>
+  
+    <script>"undefined"!=typeof __md_analytics&&__md_analytics()</script>
+  
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#openqdc.datasets.potential.bpa" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+      <div data-md-color-scheme="default" data-md-component="outdated" hidden>
+        
+      </div>
+    
+    
+      
+
+<header class="md-header" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../../index.html" title="OpenQDC" class="md-header__button md-logo" aria-label="OpenQDC" data-md-component="logo">
+      
+  <img src="../../assets/qdc_logo.png" alt="logo">
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            OpenQDC
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              3BPA
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <!-- Reads the stored "__palette" value through __md_get. If it has a color object, every entry of that object is copied onto document.body as a data-md-color-KEY attribute. When the stored media equals "(prefers-color-scheme)", media, scheme, primary and accent are first re-read from the element matching the light or dark data-md-color-media selector. NOTE(review): no element carrying data-md-color-media is visible before this script on this page, so querySelector may return null and getAttribute would then throw; confirm whether a palette toggle is rendered elsewhere. --><script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <!-- Search dialog: query form plus result list. aria-label gives the dialog role the accessible name it requires. --><div class="md-search" data-md-component="search" role="dialog" aria-label="Search">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/valence-labs/openQDC" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.6.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    openQDC
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+            
+<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
+  <div class="md-grid">
+    <ul class="md-tabs__list">
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../index.html" class="md-tabs__link">
+        
+  
+    
+  
+  Overview
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    
+    
+      <li class="md-tabs__item">
+        <a href="../../usage.html" class="md-tabs__link">
+          
+  
+  Usage
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../datasets.html" class="md-tabs__link">
+        
+  
+    
+  
+  Available Datasets
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../normalization_e0s.html" class="md-tabs__link">
+        
+  
+    
+  
+  QM methods
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../data_storage.html" class="md-tabs__link">
+        
+  
+    
+  
+  Data structure
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    
+    
+      <li class="md-tabs__item">
+        <a href="../../tutorials/usage.html" class="md-tabs__link">
+          
+  
+  Tutorials
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+    
+  
+  
+    
+    
+      <li class="md-tabs__item md-tabs__item--active">
+        <a href="../methods.html" class="md-tabs__link">
+          
+  
+  API
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+  
+    
+    
+      <li class="md-tabs__item">
+        <a href="../../contribute.html" class="md-tabs__link">
+          
+  
+  Contribute
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../licensing.html" class="md-tabs__link">
+        
+  
+    
+  
+  License
+
+      </a>
+    </li>
+  
+
+      
+    </ul>
+  </div>
+</nav>
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+  
+
+
+<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../../index.html" title="OpenQDC" class="md-nav__button md-logo" aria-label="OpenQDC" data-md-component="logo">
+      
+  <img src="../../assets/qdc_logo.png" alt="logo">
+
+    </a>
+    OpenQDC
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/valence-labs/openQDC" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.6.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    openQDC
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../index.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
+        
+          
+          <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Usage
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_2">
+            <span class="md-nav__icon md-icon"></span>
+            Usage
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../usage.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Base usage
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../cli.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    CLI
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../datasets.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available Datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../normalization_e0s.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM methods
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../data_storage.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Data structure
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
+        
+          
+          <label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Tutorials
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_6">
+            <span class="md-nav__icon md-icon"></span>
+            Tutorials
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../tutorials/usage.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    OpenQDC Hands-on Tutorial
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    
+    
+    
+      
+        
+        
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_7" id="__nav_7_label">
+            <!-- Heading label of the API navigation group, bound to the __nav_7 checkbox. The former empty tabindex attribute was not a valid integer and has been dropped. -->
+  
+  <span class="md-ellipsis">
+    API
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_7">
+            <span class="md-nav__icon md-icon"></span>
+            API
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../methods.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM methods
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../regressor.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Normalization regressor
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_3" >
+        
+          
+          <label class="md-nav__link" for="__nav_7_3" id="__nav_7_3_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Main classes
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_7_3_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_7_3">
+            <span class="md-nav__icon md-icon"></span>
+            Main classes
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../basedataset.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../properties.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available Properties
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../e0_dispatcher.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    e0 Dispatcher
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../statistics.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Statistics
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../formats.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Format loading
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_5" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_7_5" id="__nav_7_5_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Datasets
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_7_5_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_7_5">
+            <span class="md-nav__icon md-icon"></span>
+            Datasets
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_5_1" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_7_5_1" id="__nav_7_5_1_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Potential Energy
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="3" aria-labelledby="__nav_7_5_1_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_7_5_1">
+            <span class="md-nav__icon md-icon"></span>
+            Potential Energy
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      <!-- Entry for the page being viewed: the label is bound to the __toc checkbox, the link points at this page (marked with aria-current), and the nested nav lists the in-page anchors. -->
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="3bpa.html" class="md-nav__link md-nav__link--active" aria-current="page">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.bpa" class="md-nav__link">
+    <span class="md-ellipsis">
+      bpa
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.bpa.BPA" class="md-nav__link">
+    <span class="md-ellipsis">
+      BPA
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="alchemy.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Alchemy
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="ani.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    ANI
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="spice.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Spice
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="geom.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    GEOM
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qmugs.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Qmugs
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="iso_17.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    ISO_17
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="comp6.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Comp6
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="gdml.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    GDML
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="molecule3d.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Molecule3D
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="orbnet_denali.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Orbnet Denali
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="sn2_rxn.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    SN2 RXN
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qm7x.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM7X
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qm1b.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM1B
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="nabladft.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    NablaDFT
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="solvated_peptides.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Solvated Peptides
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="waterclusters3_30.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Waterclusters3_30
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="waterclusters.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    SCAN Waterclusters
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="tmqm.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    TMQM
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="pcqm.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    PCQM
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="revmd17.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    RevMD17
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="md22.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MD22
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="transition1x.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Transition1X
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="multixcqm9.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MultixcQM9
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qmx.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QMX
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="proteinfragments.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Protein Fragments
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="vqm24.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    VQM24
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_5_2" >
+        
+          
+          <label class="md-nav__link" for="__nav_7_5_2" id="__nav_7_5_2_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Interaction Energy
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="3" aria-labelledby="__nav_7_5_2_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_7_5_2">
+            <span class="md-nav__icon md-icon"></span>
+            Interaction Energy
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="des.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    DES
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="l7.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    L7
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="x40.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    X40
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="metcalf.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Metcalf
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="splinter.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Splinter
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../units.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Units
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../utils.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Utils
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_8" >
+        
+          
+          <label class="md-nav__link" for="__nav_8" id="__nav_8_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Contribute
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_8_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_8">
+            <span class="md-nav__icon md-icon"></span>
+            Contribute
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../contribute.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Maintaining
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../dataset_upload.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Add a dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../licensing.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    License
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.bpa" class="md-nav__link">
+    <span class="md-ellipsis">
+      bpa
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.bpa.BPA" class="md-nav__link">
+    <span class="md-ellipsis">
+      BPA
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+  <h1>3BPA</h1>
+
+<div class="doc doc-object doc-module">
+
+
+
+<a id="openqdc.datasets.potential.bpa"></a>
+    <div class="doc doc-contents first">
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+<div class="doc doc-object doc-class">
+
+
+
+<h2 id="openqdc.datasets.potential.bpa.BPA" class="doc doc-heading">
+            <code>BPA</code>
+
+
+<a href="#openqdc.datasets.potential.bpa.BPA" class="headerlink" title="Permanent link">&para;</a></h2>
+
+
+    <div class="doc doc-contents ">
+            <p class="doc doc-class-bases">
+              Bases: <code><a class="autorefs autorefs-internal" title="openqdc.BaseDataset" href="../basedataset.html#openqdc.datasets.base.BaseDataset">BaseDataset</a></code></p>
+
+
+      <p>BPA (or 3BPA) is a dataset consisting of a flexible druglike
+molecule, 3-(benzyloxy)pyridin-2-amine. This dataset features a
+complex dihedral potential energy surface with many local minima,
+which can be challenging to approximate using classical or ML force fields.
+The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to
+perturb them toward lower potential energies. Furthermore, long 25 ps MD simulations were performed at
+three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.
+The final configurations were re-evaluated using ORCA at the DFT level of
+theory using the ωB97X exchange-correlation functional and the 6-31G(d) basis set.</p>
+<p>Usage:</p>
+<div class="highlight"><pre><span></span><code><span class="kn">from</span> <span class="nn">openqdc.datasets</span> <span class="kn">import</span> <span class="n">BPA</span>
+<span class="n">dataset</span> <span class="o">=</span> <span class="n">BPA</span><span class="p">()</span>
+</code></pre></div>
+
+
+<details class="references" open>
+  <summary>References</summary>
+  <p><a href="https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647">https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647</a></p>
+</details>
+              <details class="quote">
+                <summary>Source code in <code>openqdc/datasets/potential/bpa.py</code></summary>
+                <div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">22</span>
+<span class="normal">23</span>
+<span class="normal">24</span>
+<span class="normal">25</span>
+<span class="normal">26</span>
+<span class="normal">27</span>
+<span class="normal">28</span>
+<span class="normal">29</span>
+<span class="normal">30</span>
+<span class="normal">31</span>
+<span class="normal">32</span>
+<span class="normal">33</span>
+<span class="normal">34</span>
+<span class="normal">35</span>
+<span class="normal">36</span>
+<span class="normal">37</span>
+<span class="normal">38</span>
+<span class="normal">39</span>
+<span class="normal">40</span>
+<span class="normal">41</span>
+<span class="normal">42</span>
+<span class="normal">43</span>
+<span class="normal">44</span>
+<span class="normal">45</span>
+<span class="normal">46</span>
+<span class="normal">47</span>
+<span class="normal">48</span>
+<span class="normal">49</span>
+<span class="normal">50</span>
+<span class="normal">51</span>
+<span class="normal">52</span>
+<span class="normal">53</span>
+<span class="normal">54</span>
+<span class="normal">55</span>
+<span class="normal">56</span>
+<span class="normal">57</span>
+<span class="normal">58</span>
+<span class="normal">59</span>
+<span class="normal">60</span>
+<span class="normal">61</span>
+<span class="normal">62</span>
+<span class="normal">63</span>
+<span class="normal">64</span>
+<span class="normal">65</span>
+<span class="normal">66</span>
+<span class="normal">67</span>
+<span class="normal">68</span>
+<span class="normal">69</span>
+<span class="normal">70</span>
+<span class="normal">71</span>
+<span class="normal">72</span>
+<span class="normal">73</span>
+<span class="normal">74</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">class</span> <span class="nc">BPA</span><span class="p">(</span><span class="n">BaseDataset</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike</span>
+<span class="sd">    molecule 3-(benzyloxy)pyridin-2-amine. This dataset features</span>
+<span class="sd">    complex dihedral potential energy surface with many local minima,</span>
+<span class="sd">    which can be challenging to approximate using classical or ML force fields.</span>
+<span class="sd">    The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to</span>
+<span class="sd">    perturb the toward lower potential energies. Furthermore, long 25 ps MD simulation were performed at</span>
+<span class="sd">    three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.</span>
+<span class="sd">    The final configurations were re-evaluated using ORCA at the DFT level of</span>
+<span class="sd">    theory using the ωB97X exchange correlation functional and the 6-31G(d) basis set.</span>
+
+<span class="sd">    Usage:</span>
+<span class="sd">    ```python</span>
+<span class="sd">    from openqdc.datasets import BPA</span>
+<span class="sd">    dataset = BPA()</span>
+<span class="sd">    ```</span>
+
+
+<span class="sd">    References:</span>
+<span class="sd">        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+
+    <span class="vm">__name__</span> <span class="o">=</span> <span class="s2">&quot;BPA&quot;</span>
+    <span class="n">__energy_unit__</span> <span class="o">=</span> <span class="s2">&quot;ev&quot;</span>
+    <span class="n">__forces_unit__</span> <span class="o">=</span> <span class="s2">&quot;ev/ang&quot;</span>
+    <span class="n">__distance_unit__</span> <span class="o">=</span> <span class="s2">&quot;ang&quot;</span>
+    <span class="n">__force_mask__</span> <span class="o">=</span> <span class="p">[</span><span class="kc">True</span><span class="p">]</span>
+    <span class="n">__energy_methods__</span> <span class="o">=</span> <span class="p">[</span><span class="n">PotentialMethod</span><span class="o">.</span><span class="n">WB97X_6_31G_D</span><span class="p">]</span>
+    <span class="n">__links__</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;BPA.zip&quot;</span><span class="p">:</span> <span class="s2">&quot;https://figshare.com/ndownloader/files/31325990&quot;</span><span class="p">}</span>
+
+    <span class="k">def</span> <span class="nf">read_raw_entries</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Dict</span><span class="p">]:</span>
+        <span class="kn">import</span> <span class="nn">os.path</span> <span class="k">as</span> <span class="nn">osp</span>
+        <span class="kn">from</span> <span class="nn">glob</span> <span class="kn">import</span> <span class="n">glob</span>
+
+        <span class="kn">from</span> <span class="nn">ase.io</span> <span class="kn">import</span> <span class="n">iread</span>
+
+        <span class="n">files</span> <span class="o">=</span> <span class="n">glob</span><span class="p">(</span><span class="n">osp</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">root</span><span class="p">,</span> <span class="s2">&quot;dataset_3BPA&quot;</span><span class="p">,</span> <span class="s2">&quot;*.xyz&quot;</span><span class="p">))</span>
+        <span class="n">files</span> <span class="o">=</span> <span class="p">[</span><span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">files</span> <span class="k">if</span> <span class="s2">&quot;iso_atoms.xyz&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">f</span><span class="p">]</span>
+        <span class="n">all_records</span> <span class="o">=</span> <span class="p">[]</span>
+
+        <span class="k">for</span> <span class="n">file</span> <span class="ow">in</span> <span class="n">files</span><span class="p">:</span>
+            <span class="n">subset</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="n">osp</span><span class="o">.</span><span class="n">basename</span><span class="p">(</span><span class="n">file</span><span class="p">)</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;.&quot;</span><span class="p">)[</span><span class="mi">0</span><span class="p">]])</span>
+
+            <span class="k">for</span> <span class="n">atoms</span> <span class="ow">in</span> <span class="n">iread</span><span class="p">(</span><span class="n">file</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;extxyz&quot;</span><span class="p">):</span>
+                <span class="n">all_records</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">read_bpa_record</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="n">atoms</span><span class="p">))</span>
+
+        <span class="k">return</span> <span class="n">all_records</span>
+
+    <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">):</span>
+        <span class="n">data</span> <span class="o">=</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span>
+        <span class="n">data</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="s2">&quot;split&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_convert_array</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="s2">&quot;split&quot;</span><span class="p">][</span><span class="n">idx</span><span class="p">]))</span>
+        <span class="k">return</span> <span class="n">data</span>
+</code></pre></div></td></tr></table></div>
+              </details>
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+
+
+
+
+
+  
+
+
+
+  <form class="md-feedback" name="feedback" hidden>
+    <fieldset>
+      <legend class="md-feedback__title">
+        Was this page helpful?
+      </legend>
+      <div class="md-feedback__inner">
+        <div class="md-feedback__list">
+          
+            <button class="md-feedback__icon md-icon" type="submit" title="This page was helpful" data-md-value="1">
+              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 12a8 8 0 0 0-8-8 8 8 0 0 0-8 8 8 8 0 0 0 8 8 8 8 0 0 0 8-8m2 0a10 10 0 0 1-10 10A10 10 0 0 1 2 12 10 10 0 0 1 12 2a10 10 0 0 1 10 10M10 9.5c0 .8-.7 1.5-1.5 1.5S7 10.3 7 9.5 7.7 8 8.5 8s1.5.7 1.5 1.5m7 0c0 .8-.7 1.5-1.5 1.5S14 10.3 14 9.5 14.7 8 15.5 8s1.5.7 1.5 1.5m-5 7.73c-1.75 0-3.29-.73-4.19-1.81L9.23 14c.45.72 1.52 1.23 2.77 1.23s2.32-.51 2.77-1.23l1.42 1.42c-.9 1.08-2.44 1.81-4.19 1.81Z"/></svg>
+            </button>
+          
+            <button class="md-feedback__icon md-icon" type="submit" title="This page could be improved" data-md-value="0">
+              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 12a8 8 0 0 0-8-8 8 8 0 0 0-8 8 8 8 0 0 0 8 8 8 8 0 0 0 8-8m2 0a10 10 0 0 1-10 10A10 10 0 0 1 2 12 10 10 0 0 1 12 2a10 10 0 0 1 10 10m-6.5-4c.8 0 1.5.7 1.5 1.5s-.7 1.5-1.5 1.5-1.5-.7-1.5-1.5.7-1.5 1.5-1.5M10 9.5c0 .8-.7 1.5-1.5 1.5S7 10.3 7 9.5 7.7 8 8.5 8s1.5.7 1.5 1.5m2 4.5c1.75 0 3.29.72 4.19 1.81l-1.42 1.42C14.32 16.5 13.25 16 12 16s-2.32.5-2.77 1.23l-1.42-1.42C8.71 14.72 10.25 14 12 14Z"/></svg>
+            </button>
+          
+        </div>
+        <div class="md-feedback__note">
+          
+            <div data-md-value="1" hidden>
+              
+              
+                
+              
+              Thanks for your feedback!
+            </div>
+          
+            <div data-md-value="0" hidden>
+              
+              
+                
+              
+              Thanks for your feedback!
+            </div>
+          
+        </div>
+      </div>
+    </fieldset>
+  </form>
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+    <div class="md-copyright__highlight">
+      Copyright 2024 Valence Labs
+    </div>
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": ["navigation.tabs"], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": {"provider": "mike"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.af256bd8.min.js"></script>
+      
+        <script src="../../javascripts/config.js"></script>
+      
+        <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/main/API/datasets/alchemy.html b/main/API/datasets/alchemy.html
index afc962c..7f33abd 100644
--- a/main/API/datasets/alchemy.html
+++ b/main/API/datasets/alchemy.html
@@ -13,7 +13,7 @@
         <link rel="canonical" href="https://github.com/valence-labs/openQDC/main/API/datasets/alchemy.html">
       
       
-        <link rel="prev" href="../formats.html">
+        <link rel="prev" href="3bpa.html">
       
       
         <link rel="next" href="ani.html">
@@ -951,6 +951,27 @@
                 
   
   
+  
+  
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
     
   
   
@@ -1070,6 +1091,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/ani.html b/main/API/datasets/ani.html
index 3460e7c..861c30f 100644
--- a/main/API/datasets/ani.html
+++ b/main/API/datasets/ani.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1106,6 +1127,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/comp6.html b/main/API/datasets/comp6.html
index 4810f4f..12048c5 100644
--- a/main/API/datasets/comp6.html
+++ b/main/API/datasets/comp6.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/des.html b/main/API/datasets/des.html
index 853ea36..2a5300d 100644
--- a/main/API/datasets/des.html
+++ b/main/API/datasets/des.html
@@ -951,6 +951,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1014,6 +1035,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/gdml.html b/main/API/datasets/gdml.html
index 58e8232..f6a0b33 100644
--- a/main/API/datasets/gdml.html
+++ b/main/API/datasets/gdml.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/geom.html b/main/API/datasets/geom.html
index 6610a54..dd7b324 100644
--- a/main/API/datasets/geom.html
+++ b/main/API/datasets/geom.html
@@ -13,7 +13,7 @@
         <link rel="canonical" href="https://github.com/valence-labs/openQDC/main/API/datasets/geom.html">
       
       
-        <link rel="prev" href="spice.html">
+        <link rel="prev" href="maceoff.html">
       
       
         <link rel="next" href="qmugs.html">
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1014,6 +1035,27 @@
                 
   
   
+  
+  
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
     
   
   
diff --git a/main/API/datasets/iso_17.html b/main/API/datasets/iso_17.html
index 8a39159..7e4e37f 100644
--- a/main/API/datasets/iso_17.html
+++ b/main/API/datasets/iso_17.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/l7.html b/main/API/datasets/l7.html
index a1c0dcd..6c2a92c 100644
--- a/main/API/datasets/l7.html
+++ b/main/API/datasets/l7.html
@@ -951,6 +951,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1014,6 +1035,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/maceoff.html b/main/API/datasets/maceoff.html
new file mode 100644
index 0000000..322920d
--- /dev/null
+++ b/main/API/datasets/maceoff.html
@@ -0,0 +1,2294 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Harness the power of quantum chemistry in one line of code.">
+      
+      
+      
+        <link rel="canonical" href="https://github.com/valence-labs/openQDC/main/API/datasets/maceoff.html">
+      
+      
+        <link rel="prev" href="spice.html">
+      
+      
+        <link rel="next" href="geom.html">
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.33">
+    
+    
+      
+        <title>MaceOFF - OpenQDC</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.3cba04c6.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+      <link rel="stylesheet" href="../../css/custom.css">
+    
+      <link rel="stylesheet" href="../../css/custom-openqdc.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+  
+
+
+  
+  
+
+<script id="__analytics">function __md_analytics(){function n(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],n("js",new Date),n("config","G-3ESBDCXFTZ"),document.addEventListener("DOMContentLoaded",function(){document.forms.search&&document.forms.search.query.addEventListener("blur",function(){this.value&&n("event","search",{search_term:this.value})}),document$.subscribe(function(){var a=document.forms.feedback;if(void 0!==a)for(var e of a.querySelectorAll("[type=submit]"))e.addEventListener("click",function(e){e.preventDefault();var t=document.location.pathname,e=this.getAttribute("data-md-value");n("event","feedback",{page:t,data:e}),a.firstElementChild.disabled=!0;e=a.querySelector(".md-feedback__note [data-md-value='"+e+"']");e&&(e.hidden=!1)}),a.hidden=!1}),location$.subscribe(function(e){n("config","G-3ESBDCXFTZ",{page_path:e.pathname})})});var e=document.createElement("script");e.async=!0,e.src="https://www.googletagmanager.com/gtag/js?id=G-3ESBDCXFTZ",document.getElementById("__analytics").insertAdjacentElement("afterEnd",e)}</script>
+  
+    <script>"undefined"!=typeof __md_analytics&&__md_analytics()</script>
+  
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#openqdc.datasets.potential.maceoff" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+      <div data-md-color-scheme="default" data-md-component="outdated" hidden>
+        
+      </div>
+    
+    
+      
+
+<header class="md-header" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../../index.html" title="OpenQDC" class="md-header__button md-logo" aria-label="OpenQDC" data-md-component="logo">
+      
+  <img src="../../assets/qdc_logo.png" alt="logo">
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            OpenQDC
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              MaceOFF
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/valence-labs/openQDC" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.6.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    openQDC
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+            
+<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
+  <div class="md-grid">
+    <ul class="md-tabs__list">
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../index.html" class="md-tabs__link">
+        
+  
+    
+  
+  Overview
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    
+    
+      <li class="md-tabs__item">
+        <a href="../../usage.html" class="md-tabs__link">
+          
+  
+  Usage
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../datasets.html" class="md-tabs__link">
+        
+  
+    
+  
+  Available Datasets
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../normalization_e0s.html" class="md-tabs__link">
+        
+  
+    
+  
+  QM methods
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../data_storage.html" class="md-tabs__link">
+        
+  
+    
+  
+  Data structure
+
+      </a>
+    </li>
+  
+
+      
+        
+  
+  
+  
+    
+    
+      <li class="md-tabs__item">
+        <a href="../../tutorials/usage.html" class="md-tabs__link">
+          
+  
+  Tutorials
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+    
+  
+  
+    
+    
+      <li class="md-tabs__item md-tabs__item--active">
+        <a href="../methods.html" class="md-tabs__link">
+          
+  
+  API
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+  
+    
+    
+      <li class="md-tabs__item">
+        <a href="../../contribute.html" class="md-tabs__link">
+          
+  
+  Contribute
+
+        </a>
+      </li>
+    
+  
+
+      
+        
+  
+  
+  
+    <li class="md-tabs__item">
+      <a href="../../licensing.html" class="md-tabs__link">
+        
+  
+    
+  
+  License
+
+      </a>
+    </li>
+  
+
+      
+    </ul>
+  </div>
+</nav>
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+  
+
+
+<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../../index.html" title="OpenQDC" class="md-nav__button md-logo" aria-label="OpenQDC" data-md-component="logo">
+      
+  <img src="../../assets/qdc_logo.png" alt="logo">
+
+    </a>
+    OpenQDC
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/valence-labs/openQDC" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.6.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    openQDC
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../index.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
+        
+          
+          <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Usage
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_2">
+            <span class="md-nav__icon md-icon"></span>
+            Usage
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../usage.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Base usage
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../cli.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    CLI
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../datasets.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available Datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../normalization_e0s.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM methods
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../data_storage.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Data structure
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
+        
+          
+          <label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Tutorials
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_6">
+            <span class="md-nav__icon md-icon"></span>
+            Tutorials
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../tutorials/usage.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    OpenQDC Hands-on Tutorial
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    
+    
+    
+      
+        
+        
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="">
+            
+  
+  <span class="md-ellipsis">
+    API
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_7">
+            <span class="md-nav__icon md-icon"></span>
+            API
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../methods.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM methods
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../regressor.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Normalization regressor
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_3" >
+        
+          
+          <label class="md-nav__link" for="__nav_7_3" id="__nav_7_3_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Main classes
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_7_3_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_7_3">
+            <span class="md-nav__icon md-icon"></span>
+            Main classes
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../basedataset.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../properties.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available Properties
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../e0_dispatcher.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    e0 Dispatcher
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../statistics.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Statistics
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../formats.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Format loading
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_5" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_7_5" id="__nav_7_5_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Datasets
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_7_5_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_7_5">
+            <span class="md-nav__icon md-icon"></span>
+            Datasets
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_5_1" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_7_5_1" id="__nav_7_5_1_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Potential Energy
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="3" aria-labelledby="__nav_7_5_1_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_7_5_1">
+            <span class="md-nav__icon md-icon"></span>
+            Potential Energy
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="alchemy.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Alchemy
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="ani.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    ANI
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="spice.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Spice
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="maceoff.html" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.maceoff" class="md-nav__link">
+    <span class="md-ellipsis">
+      maceoff
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.maceoff.MACEOFF" class="md-nav__link">
+    <span class="md-ellipsis">
+      MACEOFF
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="geom.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    GEOM
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qmugs.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Qmugs
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="iso_17.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    ISO_17
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="comp6.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Comp6
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="gdml.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    GDML
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="molecule3d.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Molecule3D
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="orbnet_denali.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Orbnet Denali
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="sn2_rxn.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    SN2 RXN
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qm7x.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM7X
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qm1b.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QM1B
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="nabladft.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    NablaDFT
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="solvated_peptides.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Solvated Peptides
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="waterclusters3_30.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Waterclusters3_30
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="waterclusters.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    SCAN Waterclusters
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="tmqm.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    TMQM
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="pcqm.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    PCQM
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="revmd17.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    RevMD17
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="md22.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MD22
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="transition1x.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Transition1X
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="multixcqm9.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MultixcQM9
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="qmx.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    QMX
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="proteinfragments.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Protein Fragments
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="vqm24.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    VQM24
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7_5_2" >
+        
+          
+          <label class="md-nav__link" for="__nav_7_5_2" id="__nav_7_5_2_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Interaction Energy
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="3" aria-labelledby="__nav_7_5_2_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_7_5_2">
+            <span class="md-nav__icon md-icon"></span>
+            Interaction Energy
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="des.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    DES
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="l7.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    L7
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="x40.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    X40
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="metcalf.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Metcalf
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="splinter.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Splinter
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../units.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Units
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../utils.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Utils
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+      
+      
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_8" >
+        
+          
+          <label class="md-nav__link" for="__nav_8" id="__nav_8_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    Contribute
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_8_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_8">
+            <span class="md-nav__icon md-icon"></span>
+            Contribute
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../contribute.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Maintaining
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../dataset_upload.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Add a dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../licensing.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    License
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.maceoff" class="md-nav__link">
+    <span class="md-ellipsis">
+      maceoff
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#openqdc.datasets.potential.maceoff.MACEOFF" class="md-nav__link">
+    <span class="md-ellipsis">
+      MACEOFF
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+  <h1>MaceOFF</h1>
+
+<div class="doc doc-object doc-module">
+
+
+
+<a id="openqdc.datasets.potential.maceoff"></a>
+    <div class="doc doc-contents first">
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+<div class="doc doc-object doc-class">
+
+
+
+<h2 id="openqdc.datasets.potential.maceoff.MACEOFF" class="doc doc-heading">
+            <code>MACEOFF</code>
+
+
+<a href="#openqdc.datasets.potential.maceoff.MACEOFF" class="headerlink" title="Permanent link">&para;</a></h2>
+
+
+    <div class="doc doc-contents ">
+            <p class="doc doc-class-bases">
+              Bases: <code><a class="autorefs autorefs-internal" title="openqdc.datasets.base.BaseDataset" href="../basedataset.html#openqdc.datasets.base.BaseDataset">BaseDataset</a></code></p>
+
+
+      <p>The core of the MACEOFF dataset consists of the Spice V1 dataset.
+95% of the data are used for training and validation under the "train" split,
+and 5% for testing. The dataset uses the Spice level of theory
+ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.
+MACEOFF uses a subset of SPICE that contains the ten chemical elements
+H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.
+MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular
+non-bonded interactions, the MACEOFF dataset contains larger 50–90 atom molecules
+randomly selected from the QMugs dataset.
+MACEOFF also contains a number of water clusters carved out of molecular dynamics simulations
+of liquid water, with sizes of up to 50 water molecules, as well as part of the
+COMP6 tripeptide geometry dataset.</p>
+<p>Usage:</p>
+<div class="highlight"><pre><span></span><code><span class="kn">from</span> <span class="nn">openqdc.datasets</span> <span class="kn">import</span> <span class="n">MACEOFF</span>
+<span class="n">dataset</span> <span class="o">=</span> <span class="n">MACEOFF</span><span class="p">()</span>
+</code></pre></div>
+
+
+<details class="species" open>
+  <summary>Species</summary>
+  <p>[H, C, N, O, F, P, S, Cl, Br, I]</p>
+</details>
+
+<details class="references" open>
+  <summary>References</summary>
+  <p><a href="https://arxiv.org/pdf/2312.15211">https://arxiv.org/pdf/2312.15211</a></p>
+<p><a href="https://doi.org/10.17863/CAM.107498">https://doi.org/10.17863/CAM.107498</a></p>
+</details>
+              <details class="quote">
+                <summary>Source code in <code>openqdc/datasets/potential/maceoff.py</code></summary>
+                <div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"> 74</span>
+<span class="normal"> 75</span>
+<span class="normal"> 76</span>
+<span class="normal"> 77</span>
+<span class="normal"> 78</span>
+<span class="normal"> 79</span>
+<span class="normal"> 80</span>
+<span class="normal"> 81</span>
+<span class="normal"> 82</span>
+<span class="normal"> 83</span>
+<span class="normal"> 84</span>
+<span class="normal"> 85</span>
+<span class="normal"> 86</span>
+<span class="normal"> 87</span>
+<span class="normal"> 88</span>
+<span class="normal"> 89</span>
+<span class="normal"> 90</span>
+<span class="normal"> 91</span>
+<span class="normal"> 92</span>
+<span class="normal"> 93</span>
+<span class="normal"> 94</span>
+<span class="normal"> 95</span>
+<span class="normal"> 96</span>
+<span class="normal"> 97</span>
+<span class="normal"> 98</span>
+<span class="normal"> 99</span>
+<span class="normal">100</span>
+<span class="normal">101</span>
+<span class="normal">102</span>
+<span class="normal">103</span>
+<span class="normal">104</span>
+<span class="normal">105</span>
+<span class="normal">106</span>
+<span class="normal">107</span>
+<span class="normal">108</span>
+<span class="normal">109</span>
+<span class="normal">110</span>
+<span class="normal">111</span>
+<span class="normal">112</span>
+<span class="normal">113</span>
+<span class="normal">114</span>
+<span class="normal">115</span>
+<span class="normal">116</span>
+<span class="normal">117</span>
+<span class="normal">118</span>
+<span class="normal">119</span>
+<span class="normal">120</span>
+<span class="normal">121</span>
+<span class="normal">122</span>
+<span class="normal">123</span>
+<span class="normal">124</span>
+<span class="normal">125</span>
+<span class="normal">126</span>
+<span class="normal">127</span>
+<span class="normal">128</span>
+<span class="normal">129</span>
+<span class="normal">130</span>
+<span class="normal">131</span>
+<span class="normal">132</span>
+<span class="normal">133</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">class</span> <span class="nc">MACEOFF</span><span class="p">(</span><span class="n">BaseDataset</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    MACEOFF dataset core of the dataset consist in the Spice V1 dataset.</span>
+<span class="sd">    95% of the data are used for training and validation under the &quot;train&quot; split,</span>
+<span class="sd">    and 5% for testing. The dataset uses the Spice level of theory</span>
+<span class="sd">    ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.</span>
+<span class="sd">    MACEOFF uses a subset of SPICE that contains the ten chemical elements</span>
+<span class="sd">    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.</span>
+<span class="sd">    MACEOFF doesn&#39;t contain ion pairs. To facilitate the learning of intramolecular</span>
+<span class="sd">    non-bonded interactions, MACEOFF dataset contains larger 50–90 atom molecules</span>
+<span class="sd">    randomly selected from the QMugs dataset.</span>
+<span class="sd">    MACEOFF contains a number of water clusters carved out of molecular dynamics simulations</span>
+<span class="sd">    of liquid water, with sizes of up to 50 water molecules and part of the</span>
+<span class="sd">    COMP6 tripeptide geometry dataset.</span>
+
+<span class="sd">    Usage:</span>
+<span class="sd">    ```python</span>
+<span class="sd">    from openqdc.datasets import MACEOFF</span>
+<span class="sd">    dataset = MACEOFF()</span>
+<span class="sd">    ```</span>
+
+<span class="sd">    Species:</span>
+<span class="sd">        [H, C, N, O, F, P, S, Cl, Br, I]</span>
+
+<span class="sd">    References:</span>
+<span class="sd">        https://arxiv.org/pdf/2312.15211\n</span>
+<span class="sd">        https://doi.org/10.17863/CAM.107498</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+
+    <span class="vm">__name__</span> <span class="o">=</span> <span class="s2">&quot;maceoff&quot;</span>
+
+    <span class="n">__energy_methods__</span> <span class="o">=</span> <span class="p">[</span><span class="n">PotentialMethod</span><span class="o">.</span><span class="n">WB97M_D3BJ_DEF2_TZVPPD</span><span class="p">]</span>
+    <span class="n">__force_mask__</span> <span class="o">=</span> <span class="p">[</span><span class="kc">True</span><span class="p">]</span>
+    <span class="n">__energy_unit__</span> <span class="o">=</span> <span class="s2">&quot;ev&quot;</span>
+    <span class="n">__distance_unit__</span> <span class="o">=</span> <span class="s2">&quot;ang&quot;</span>
+    <span class="n">__forces_unit__</span> <span class="o">=</span> <span class="s2">&quot;ev/ang&quot;</span>
+
+    <span class="n">energy_target_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;dft_total_energy&quot;</span><span class="p">]</span>
+    <span class="n">force_target_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;dft_total_gradient&quot;</span><span class="p">]</span>
+
+    <span class="n">__links__</span> <span class="o">=</span> <span class="p">{</span>
+        <span class="s2">&quot;train_large_neut_no_bad_clean.tar.gz&quot;</span><span class="p">:</span> <span class="s2">&quot;https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content&quot;</span><span class="p">,</span>  <span class="c1"># noqa: E501</span>
+        <span class="s2">&quot;test_large_neut_all.tar.gz&quot;</span><span class="p">:</span> <span class="s2">&quot;https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content&quot;</span><span class="p">,</span>  <span class="c1"># noqa: E501</span>
+    <span class="p">}</span>
+
+    <span class="k">def</span> <span class="nf">read_raw_entries</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+        <span class="n">entries</span> <span class="o">=</span> <span class="p">[]</span>
+        <span class="k">for</span> <span class="n">filename</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__links__</span><span class="p">:</span>
+            <span class="n">filename</span> <span class="o">=</span> <span class="n">filename</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;.&quot;</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+            <span class="n">xyzpath</span> <span class="o">=</span> <span class="n">p_join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">root</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">filename</span><span class="si">}</span><span class="s2">.xyz&quot;</span><span class="p">)</span>
+            <span class="n">split</span> <span class="o">=</span> <span class="n">filename</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;_&quot;</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+            <span class="n">structure_iterator</span> <span class="o">=</span> <span class="n">parse_mace_xyz</span><span class="p">(</span><span class="n">xyzpath</span><span class="p">)</span>
+            <span class="n">func</span> <span class="o">=</span> <span class="n">partial</span><span class="p">(</span><span class="n">build_data_object</span><span class="p">,</span> <span class="n">split</span><span class="o">=</span><span class="n">split</span><span class="p">)</span>
+            <span class="n">entries</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">dm</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">parallelized</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">structure_iterator</span><span class="p">))</span>
+        <span class="k">return</span> <span class="n">entries</span>
+
+    <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">):</span>
+        <span class="n">data</span> <span class="o">=</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span>
+        <span class="n">data</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="s2">&quot;split&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_convert_array</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="s2">&quot;split&quot;</span><span class="p">][</span><span class="n">idx</span><span class="p">]))</span>
+        <span class="k">return</span> <span class="n">data</span>
+</code></pre></div></td></tr></table></div>
+              </details>
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+
+
+
+
+
+  
+
+
+
+  <form class="md-feedback" name="feedback" hidden>
+    <fieldset>
+      <legend class="md-feedback__title">
+        Was this page helpful?
+      </legend>
+      <div class="md-feedback__inner">
+        <div class="md-feedback__list">
+          
+            <button class="md-feedback__icon md-icon" type="submit" title="This page was helpful" data-md-value="1">
+              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 12a8 8 0 0 0-8-8 8 8 0 0 0-8 8 8 8 0 0 0 8 8 8 8 0 0 0 8-8m2 0a10 10 0 0 1-10 10A10 10 0 0 1 2 12 10 10 0 0 1 12 2a10 10 0 0 1 10 10M10 9.5c0 .8-.7 1.5-1.5 1.5S7 10.3 7 9.5 7.7 8 8.5 8s1.5.7 1.5 1.5m7 0c0 .8-.7 1.5-1.5 1.5S14 10.3 14 9.5 14.7 8 15.5 8s1.5.7 1.5 1.5m-5 7.73c-1.75 0-3.29-.73-4.19-1.81L9.23 14c.45.72 1.52 1.23 2.77 1.23s2.32-.51 2.77-1.23l1.42 1.42c-.9 1.08-2.44 1.81-4.19 1.81Z"/></svg>
+            </button>
+          
+            <button class="md-feedback__icon md-icon" type="submit" title="This page could be improved" data-md-value="0">
+              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 12a8 8 0 0 0-8-8 8 8 0 0 0-8 8 8 8 0 0 0 8 8 8 8 0 0 0 8-8m2 0a10 10 0 0 1-10 10A10 10 0 0 1 2 12 10 10 0 0 1 12 2a10 10 0 0 1 10 10m-6.5-4c.8 0 1.5.7 1.5 1.5s-.7 1.5-1.5 1.5-1.5-.7-1.5-1.5.7-1.5 1.5-1.5M10 9.5c0 .8-.7 1.5-1.5 1.5S7 10.3 7 9.5 7.7 8 8.5 8s1.5.7 1.5 1.5m2 4.5c1.75 0 3.29.72 4.19 1.81l-1.42 1.42C14.32 16.5 13.25 16 12 16s-2.32.5-2.77 1.23l-1.42-1.42C8.71 14.72 10.25 14 12 14Z"/></svg>
+            </button>
+          
+        </div>
+        <div class="md-feedback__note">
+          
+            <div data-md-value="1" hidden>
+              
+              
+                
+              
+              Thanks for your feedback!
+            </div>
+          
+            <div data-md-value="0" hidden>
+              
+              
+                
+              
+              Thanks for your feedback!
+            </div>
+          
+        </div>
+      </div>
+    </fieldset>
+  </form>
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+    <div class="md-copyright__highlight">
+      Copyright 2024 Valence Labs
+    </div>
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": ["navigation.tabs"], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": {"provider": "mike"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.af256bd8.min.js"></script>
+      
+        <script src="../../javascripts/config.js"></script>
+      
+        <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/main/API/datasets/md22.html b/main/API/datasets/md22.html
index 65426e6..108327e 100644
--- a/main/API/datasets/md22.html
+++ b/main/API/datasets/md22.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/metcalf.html b/main/API/datasets/metcalf.html
index f4c4092..2a4b562 100644
--- a/main/API/datasets/metcalf.html
+++ b/main/API/datasets/metcalf.html
@@ -951,6 +951,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1014,6 +1035,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/molecule3d.html b/main/API/datasets/molecule3d.html
index 3a484e9..51cab42 100644
--- a/main/API/datasets/molecule3d.html
+++ b/main/API/datasets/molecule3d.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/multixcqm9.html b/main/API/datasets/multixcqm9.html
index b6f022c..c086d71 100644
--- a/main/API/datasets/multixcqm9.html
+++ b/main/API/datasets/multixcqm9.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/nabladft.html b/main/API/datasets/nabladft.html
index d20fcc6..757a6be 100644
--- a/main/API/datasets/nabladft.html
+++ b/main/API/datasets/nabladft.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/orbnet_denali.html b/main/API/datasets/orbnet_denali.html
index a7cfb09..8e6755f 100644
--- a/main/API/datasets/orbnet_denali.html
+++ b/main/API/datasets/orbnet_denali.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/pcqm.html b/main/API/datasets/pcqm.html
index cb2e83d..6a3212f 100644
--- a/main/API/datasets/pcqm.html
+++ b/main/API/datasets/pcqm.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/proteinfragments.html b/main/API/datasets/proteinfragments.html
index 56e623e..ab03c1a 100644
--- a/main/API/datasets/proteinfragments.html
+++ b/main/API/datasets/proteinfragments.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/qm1b.html b/main/API/datasets/qm1b.html
index dcf9d91..e5de521 100644
--- a/main/API/datasets/qm1b.html
+++ b/main/API/datasets/qm1b.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/qm7x.html b/main/API/datasets/qm7x.html
index 0030918..2ae17ff 100644
--- a/main/API/datasets/qm7x.html
+++ b/main/API/datasets/qm7x.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/qmugs.html b/main/API/datasets/qmugs.html
index 4321a94..05582a7 100644
--- a/main/API/datasets/qmugs.html
+++ b/main/API/datasets/qmugs.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/qmx.html b/main/API/datasets/qmx.html
index 95d18da..b8dbd56 100644
--- a/main/API/datasets/qmx.html
+++ b/main/API/datasets/qmx.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/revmd17.html b/main/API/datasets/revmd17.html
index 09689b4..bcb1c60 100644
--- a/main/API/datasets/revmd17.html
+++ b/main/API/datasets/revmd17.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/sn2_rxn.html b/main/API/datasets/sn2_rxn.html
index 06ce956..e6508eb 100644
--- a/main/API/datasets/sn2_rxn.html
+++ b/main/API/datasets/sn2_rxn.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/solvated_peptides.html b/main/API/datasets/solvated_peptides.html
index c6f4bae..cc4a2a3 100644
--- a/main/API/datasets/solvated_peptides.html
+++ b/main/API/datasets/solvated_peptides.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/spice.html b/main/API/datasets/spice.html
index eefce69..be7c17b 100644
--- a/main/API/datasets/spice.html
+++ b/main/API/datasets/spice.html
@@ -16,7 +16,7 @@
         <link rel="prev" href="ani.html">
       
       
-        <link rel="next" href="geom.html">
+        <link rel="next" href="maceoff.html">
       
       
       <link rel="icon" href="../../assets/images/favicon.png">
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1097,6 +1118,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/splinter.html b/main/API/datasets/splinter.html
index ab2cdad..0366cde 100644
--- a/main/API/datasets/splinter.html
+++ b/main/API/datasets/splinter.html
@@ -951,6 +951,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1014,6 +1035,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/tmqm.html b/main/API/datasets/tmqm.html
index d8e6467..abfaf01 100644
--- a/main/API/datasets/tmqm.html
+++ b/main/API/datasets/tmqm.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/transition1x.html b/main/API/datasets/transition1x.html
index 45ae301..2408e22 100644
--- a/main/API/datasets/transition1x.html
+++ b/main/API/datasets/transition1x.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/vqm24.html b/main/API/datasets/vqm24.html
index 5e347ab..eacc43c 100644
--- a/main/API/datasets/vqm24.html
+++ b/main/API/datasets/vqm24.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/waterclusters.html b/main/API/datasets/waterclusters.html
index 63b6093..808179e 100644
--- a/main/API/datasets/waterclusters.html
+++ b/main/API/datasets/waterclusters.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/waterclusters3_30.html b/main/API/datasets/waterclusters3_30.html
index b65f18b..08b24c5 100644
--- a/main/API/datasets/waterclusters3_30.html
+++ b/main/API/datasets/waterclusters3_30.html
@@ -953,6 +953,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1016,6 +1037,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/datasets/x40.html b/main/API/datasets/x40.html
index e44c928..cced3e4 100644
--- a/main/API/datasets/x40.html
+++ b/main/API/datasets/x40.html
@@ -951,6 +951,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="alchemy.html" class="md-nav__link">
         
@@ -1014,6 +1035,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="geom.html" class="md-nav__link">
         
diff --git a/main/API/e0_dispatcher.html b/main/API/e0_dispatcher.html
index deffe74..7343e17 100644
--- a/main/API/e0_dispatcher.html
+++ b/main/API/e0_dispatcher.html
@@ -1182,6 +1182,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1245,6 +1266,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/formats.html b/main/API/formats.html
index c5ed64f..f14865d 100644
--- a/main/API/formats.html
+++ b/main/API/formats.html
@@ -16,7 +16,7 @@
         <link rel="prev" href="statistics.html">
       
       
-        <link rel="next" href="datasets/alchemy.html">
+        <link rel="next" href="datasets/3bpa.html">
       
       
       <link rel="icon" href="../assets/images/favicon.png">
@@ -1090,6 +1090,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1153,6 +1174,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/methods.html b/main/API/methods.html
index 9e36361..805f508 100644
--- a/main/API/methods.html
+++ b/main/API/methods.html
@@ -1079,6 +1079,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1142,6 +1163,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/properties.html b/main/API/properties.html
index e4ce818..0cb6e9b 100644
--- a/main/API/properties.html
+++ b/main/API/properties.html
@@ -1060,6 +1060,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1123,6 +1144,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/regressor.html b/main/API/regressor.html
index 0b59fce..47c7c01 100644
--- a/main/API/regressor.html
+++ b/main/API/regressor.html
@@ -1096,6 +1096,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1159,6 +1180,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/statistics.html b/main/API/statistics.html
index 8942845..62b8690 100644
--- a/main/API/statistics.html
+++ b/main/API/statistics.html
@@ -1266,6 +1266,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1329,6 +1350,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/units.html b/main/API/units.html
index cfd6d9d..3dccb6e 100644
--- a/main/API/units.html
+++ b/main/API/units.html
@@ -949,6 +949,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1012,6 +1033,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/API/utils.html b/main/API/utils.html
index a6c54bd..585ec61 100644
--- a/main/API/utils.html
+++ b/main/API/utils.html
@@ -949,6 +949,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/alchemy.html" class="md-nav__link">
         
@@ -1012,6 +1033,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="datasets/geom.html" class="md-nav__link">
         
diff --git a/main/cli.html b/main/cli.html
index f156fd8..2131610 100644
--- a/main/cli.html
+++ b/main/cli.html
@@ -1052,6 +1052,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1115,6 +1136,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/contribute.html b/main/contribute.html
index b985cbc..0e70051 100644
--- a/main/contribute.html
+++ b/main/contribute.html
@@ -944,6 +944,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1007,6 +1028,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/data_storage.html b/main/data_storage.html
index 4d8579d..971ed62 100644
--- a/main/data_storage.html
+++ b/main/data_storage.html
@@ -998,6 +998,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1061,6 +1082,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/dataset_upload.html b/main/dataset_upload.html
index 415e98b..bff6a4a 100644
--- a/main/dataset_upload.html
+++ b/main/dataset_upload.html
@@ -944,6 +944,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1007,6 +1028,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/datasets.html b/main/datasets.html
index e51f169..c71bf43 100644
--- a/main/datasets.html
+++ b/main/datasets.html
@@ -954,6 +954,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1017,6 +1038,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/index.html b/main/index.html
index b19c2ee..609bd4a 100644
--- a/main/index.html
+++ b/main/index.html
@@ -1018,6 +1018,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1081,6 +1102,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/licensing.html b/main/licensing.html
index 14095f7..8fbceee 100644
--- a/main/licensing.html
+++ b/main/licensing.html
@@ -937,6 +937,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1000,6 +1021,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/normalization_e0s.html b/main/normalization_e0s.html
index fc489fb..faaecc8 100644
--- a/main/normalization_e0s.html
+++ b/main/normalization_e0s.html
@@ -1026,6 +1026,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1089,6 +1110,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/objects.inv b/main/objects.inv
index 88e5672..1d90554 100644
Binary files a/main/objects.inv and b/main/objects.inv differ
diff --git a/main/search/search_index.json b/main/search/search_index.json
index fc51cd9..263320c 100644
--- a/main/search/search_index.json
+++ b/main/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"<p>OpenQDC is a python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets and provide a way to standardize the data for easy use in machine learning models.</p> <ul> <li>\ud83d\udc0d Simple pythonic API</li> <li>\ud83d\udd79\ufe0f  ML-Ready: all you manipulate are <code>torch.Tensor</code>,<code>jax.Array</code> or <code>numpy.Array</code>objects.</li> <li>\u269b\ufe0f Quantum Ready: The quantum methods are checked and standardized to provide addictional values.</li> <li>\u2705 Standardized: The datasets are written in standard and performant formats with annotated metadata like units and labels.</li> <li>\ud83e\udde0 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).</li> <li>\ud83d\udcc8 Data: have access to 1.5+ billion datapoints</li> </ul> <p>Visit our website at https://openqdc.io .</p>"},{"location":"index.html#installation","title":"Installation","text":"<p>Use mamba:</p> <pre><code>conda install -c conda-forge openqdc\n</code></pre> <p>Tips: You can replace <code>conda</code> by <code>mamba</code>.</p> <p>Note: We highly recommend using a Conda Python distribution to install OpenQDC. 
The package is also pip installable if you need it: <code>pip install openqdc</code>.</p>"},{"location":"index.html#quick-api-tour","title":"Quick API Tour","text":"<pre><code>from openqdc as Spice\n\n# Load the original dataset\ndataset = Spice()\n\n# Load the dataset with a different units\ndataset = Spice(\n    energy_unit = \"kcal/mol\",\n    distance_unit = \"ang\",\n    energy_type = \"formation\",\n    array_format = \"torch\"\n)\n\n# Access the data\ndata = dataset[0]\n\n# Get relevant statistics\ndataset.get_statistics()\n\n# Get dataset metadata\ndataset.average_n_atoms\ndataset.chemical_species\ndataset.charges\n\n# Compute physical descriptors\ndataset.calculate_descriptors(\n    descriptor_name = \"soap\"\n)\n</code></pre>"},{"location":"index.html#how-to-cite","title":"How to cite","text":"<p>Please cite OpenQDC if you use it in your research: .</p>"},{"location":"index.html#compatibilities","title":"Compatibilities","text":"<p>OpenQDC is compatible with Python &gt;= 3.8 and is tested on Linux, MacOS and Windows.</p>"},{"location":"cli.html","title":"CLI for dataset downloading and uploading","text":"<p>You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).</p>"},{"location":"cli.html#datasets","title":"Datasets","text":"<p>Print a formatted table of the available openQDC datasets and some informations.</p> <p>Usage:</p> <pre><code>openqdc datasets [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n</code></pre>"},{"location":"cli.html#cache","title":"Cache","text":"<p>Get the current local cache path of openQDC</p> <p>Usage:</p> <pre><code>openqdc cache [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n</code></pre>"},{"location":"cli.html#download","title":"Download","text":"<p>Download preprocessed ml-ready datasets from the main openQDC hub.</p> <p>Usage:</p> <pre><code>openqdc download DATASETS... 
[OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n--as-zarr       Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]\n--gs            Whether source to use for downloading. If True, Google Storage will be used.Otherwise, AWS S3 will be used [default: no-gs]\n</code></pre> <p>Example:</p> <pre><code>openqdc download Spice\n</code></pre>"},{"location":"cli.html#fetch","title":"Fetch","text":"<p>Download the raw datasets files from the main openQDC hub</p> <p>Note:</p> <pre><code>Special case: if the dataset is \"all\", \"potential\", \"interaction\".\n</code></pre> <p>Usage:</p> <pre><code>openqdc fetch DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n</code></pre> <p>Example:</p> <pre><code>openqdc fetch Spice\n</code></pre>"},{"location":"cli.html#preprocess","title":"Preprocess","text":"<p>Preprocess a raw dataset (previously fetched) into a openqdc dataset and optionally push it to remote.</p> <p>Usage:</p> <pre><code>openqdc preprocess DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help         Show this message and exit.\n--overwrite    Whether to overwrite the current cached datasets. [default: overwrite]\n--upload       Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]\n--as-zarr      Whether to preprocess as a zarr format or a memmap format. 
[default: no-as-zarr]\n</code></pre> <p>Example:</p> <pre><code>openqdc preprocess Spice QMugs\n</code></pre>"},{"location":"cli.html#upload","title":"Upload","text":"<p>Upload a preprocessed dataset to the remote storage</p> <p>Usage:</p> <pre><code>openqdc upload DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to overwrite the remote files if they are present. [default: overwrite]\n--as-zarr       Whether to upload the zarr files if available. [default: no-as-zarr]\n</code></pre> <p>Example:</p> <pre><code>openqdc upload Spice --overwrite\n</code></pre>"},{"location":"cli.html#convert","title":"Convert","text":"<p>Convert a preprocessed dataset from a memmap dataset to a zarr dataset.</p> <p>Usage:</p> <pre><code>openqdc convert DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to overwrite the current zarr cached datasets. [default: no-overwrite]\n--download      Whether to force the re-download of the memmap datasets. 
[default: no-download]\n</code></pre>"},{"location":"contribute.html","title":"Contribute","text":"<p>The below documents the development lifecycle of OpenQDC.</p>"},{"location":"contribute.html#setup-a-dev-environment","title":"Setup a dev environment","text":"<pre><code>mamba env create -n openqdc -f env.yml\nmamba activate datamol\npip install -e .\n</code></pre>"},{"location":"contribute.html#pre-commit-installation","title":"Pre commit installation","text":"<pre><code>pre-commit install\npre-commit run --all-files\n</code></pre>"},{"location":"contribute.html#continuous-integration","title":"Continuous Integration","text":"<p>OpenQDC uses Github Actions to:</p> <ul> <li>Build and test <code>openQDC</code>.<ul> <li>Multiple combinations of OS and Python versions are tested.</li> </ul> </li> <li>Check the code:<ul> <li>Formatting with <code>black</code>.</li> <li>Static type check with <code>mypy</code>.</li> <li>Modules import formatting with <code>isort</code>.</li> <li>Pre-commit hooks.</li> </ul> </li> <li>Documentation:<ul> <li>Google docstring format.</li> <li>build and deploy the documentation on <code>main</code> and for every new git tag.</li> </ul> </li> </ul>"},{"location":"contribute.html#run-tests","title":"Run tests","text":"<pre><code>pytest\n</code></pre>"},{"location":"contribute.html#build-the-documentation","title":"Build the documentation","text":"<p>You can build and serve the documentation locally with:</p> <pre><code># Build and serve the doc\nmike serve\n</code></pre> <p>or with</p> <pre><code>mkdocs serve\n</code></pre>"},{"location":"contribute.html#multi-versionning","title":"Multi-versionning","text":"<p>The doc is built for eash push on <code>main</code> and every git tags using mike. Everything is automated using Github Actions. 
Please refer to the official mike's documentation for the details.</p>"},{"location":"data_storage.html","title":"Data structure","text":""},{"location":"data_storage.html#dataset-structure","title":"Dataset structure","text":"<p>For a dataset with N geometries, M atoms across all geometries, ne energy labels, and nf force labels, we use zarr or memory-mapped arrays of various sizes:</p> <ul> <li>(M, 5) for atomic numbers (1), charges (1), and positions (3) of individual geometries;</li> </ul> <ul> <li>(N, 2) for the beginning and end indices of each geometry in the previous array;</li> </ul> <ul> <li>(N, ne) for the energy labels of each geometry, extendable to store other geometry-level QM properties such as HOMO-LUMO gap;</li> </ul> <ul> <li>(M, nf , 3) for the force labels of each geometry, extendable to store other atom-level QM properties.</li> </ul> <p>The memory-mapped files efficiently access data stored on disk or in the cloud without reading them into memory, enabling training on machines with smaller RAM than the dataset size and accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, batching and iteration.</p> <p></p>"},{"location":"data_storage.html#formats","title":"Formats","text":"<p>We currently support the following formats:</p> <p>1) Zarr : https://zarr.readthedocs.io/en/stable/index.html</p> <p>2) Memmap : https://numpy.org/doc/stable/index.html</p>"},{"location":"dataset_upload.html","title":"How to Add a Dataset to OpenQDC","text":"<p>Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? 
If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:</p> <ol> <li>Opening a PR to add a new dataset</li> <li>Request a new dataset through Google Form</li> </ol>"},{"location":"dataset_upload.html#openqdc-pr-guidelines","title":"OpenQDC PR Guidelines","text":"<p>Implement your dataset in the OpenQDC repository by following the guidelines below:</p>"},{"location":"dataset_upload.html#dataset-class","title":"Dataset class","text":"<ul> <li>The dataset class should be implemented in the <code>openqdc/datasets</code> directory.</li> <li>The dataset class should inherit from the <code>openqdc.datasets.base.BaseDataset</code> class.</li> <li>Add your <code>dataset.py</code> file to the <code>openqdc/datasets/potential</code> or <code>openqdc/datasets/interaction/</code> directory based on the type of energy.</li> <li>Implement the following for your dataset:<ul> <li>Add the metadata of the dataset:<ul> <li>Docstrings for the dataset class. Docstrings should report links and references to the dataset. A small description and if possible, the sampling strategy used to generate the dataset.</li> <li><code>__links__</code>: Dictionary of name and link to download the dataset.</li> <li><code>__name__</code>: Name of the dataset. This will create a folder with the name of the dataset in the cache directory.</li> <li>The original units for the dataset <code>__energy_unit__</code> and <code>__distance_unit__</code>.</li> <li><code>__force_mask__</code>: Boolean to indicate if the dataset has forces. Or if multiple forces are present. A list of booleans.</li> <li><code>__energy_methods__</code>: List of the <code>QmMethod</code> methods present in the dataset.</li> </ul> </li> <li><code>read_raw_entries(self)</code> -&gt; <code>List[Dict[str, Any]]</code>: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format. Look at data storage. 
This data should have the following keys:<ul> <li><code>atomic_inputs</code> : Atomic inputs of the molecule. numpy.Float32.</li> <li><code>name</code>: Atomic numbers of the atoms in the molecule. numpy.Object.</li> <li><code>subset</code>: Positions of the atoms in the molecule.  numpy.Object.</li> <li><code>energies</code>: Energies of the molecule. numpy.Float64.</li> <li><code>n_atoms</code>: Number of atoms in the molecule. numpy.Int32</li> <li><code>forces</code>: Forces of the molecule. [Optional] numpy.Float32.</li> </ul> </li> <li>Add the dataset import to the <code>openqdc/datasets/&lt;type_of_dataset&gt;/__init__.py</code> file and to <code>openqdc/__init__.py</code>.</li> </ul> </li> </ul>"},{"location":"dataset_upload.html#test-the-dataset","title":"Test the dataset","text":"<p>Try to run the openQDC CLI pipeline with the dataset you implemented.</p> <p>Run the following command to download the dataset:</p> <ul> <li>Fetch the dataset files <pre><code>openqdc fetch DATASET_NAME\n</code></pre></li> </ul> <ul> <li>Preprocess the dataset <pre><code>openqdc preprocess DATASET_NAME\n</code></pre></li> </ul> <ul> <li>Load it on python and check if the dataset is correctly loaded. <pre><code>from openqdc import DATASET_NAME\nds=DATASET_NAME()\n</code></pre></li> </ul> <p>If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.</p> <ul> <li>Select for your PR the <code>dataset</code> label.</li> </ul> <p>Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.</p>"},{"location":"dataset_upload.html#openqdc-google-form","title":"OpenQDC Google Form","text":"<p>Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you. 
You can fill out the Google Form here</p> <p>As the openQDC team will strive to provide a high quality curation and upload, please be patient as the team will need to review the dataset and carry out the necessary steps to ensure the dataset is uploaded correctly.</p>"},{"location":"datasets.html","title":"Overview of Datasets","text":"<p>We provide support for the following publicly available QM Datasets.</p> Dataset # Molecules # Conformers Average Conformers per Molecule Force Labels Atom Types QM Level of Theory Off-Equilibrium Conformations GEOM 450,000 37,000,000 82 No 18 GFN2-xTB No Molecule3D 3,899,647 3,899,647 1 No 5 B3LYP/6-31G* No NablaDFT 1,000,000 5,000,000 5 No 6 \u03c9B97X-D/def2-SVP QMugs 665,000 2,000,000 3 No 10 GFN2-xTB, \u03c9B97X-D/def2-SVP No Spice 19,238 1,132,808 59 Yes 15 \u03c9B97M-D3(BJ)/def2-TZVPPD Yes ANI 57,462 20,000,000 348 No 4 \u03c9B97x:6-31G(d) Yes tmQM 86,665 No TPSSh-D3BJ/def2-SVP DES370K 3,700 370,000 100 No 20 CCSD(T) Yes DES5M 3,700 5,000,000 1351 No 20 SNS-MP2 Yes OrbNet Denali 212,905 2,300,000 11 No 16 GFN1-xTB Yes SN2RXN 39 452709 11,600 Yes 6 DSD-BLYP-D3(BJ)/def2-TZVP QM7X 6,950 4,195,237 603 Yes 7 PBE0+MBD Yes"},{"location":"licensing.html","title":"License","text":"<pre><code>Creative Commons Attribution-NonCommercial 4.0 International\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. 
Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright and\ncertain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n-   Considerations for licensors: Our public licenses are intended for\n    use by those authorized to give the public permission to use\n    material in ways otherwise restricted by copyright and certain other\n    rights. Our licenses are irrevocable. Licensors should read and\n    understand the terms and conditions of the license they choose\n    before applying it. Licensors should also secure all rights\n    necessary before applying our licenses so that the public can reuse\n    the material as expected. Licensors should clearly mark any material\n    not subject to the license. This includes other CC-licensed\n    material, or material used under an exception or limitation to\n    copyright. More considerations for licensors :\n    wiki.creativecommons.org/Considerations\\_for\\_licensors\n\n-   Considerations for the public: By using one of our public licenses,\n    a licensor grants the public permission to use the licensed material\n    under specified terms and conditions. If the licensor's permission\n    is not necessary for any reason\u2013for example, because of any\n    applicable exception or limitation to copyright\u2013then that use is not\n    regulated by the license. Our licenses grant only permissions under\n    copyright and certain other rights that a licensor has authority to\n    grant. 
Use of the licensed material may still be restricted for\n    other reasons, including because others have copyright or other\n    rights in the material. A licensor may make special requests, such\n    as asking that all changes be marked or described. Although not\n    required by our licenses, you are encouraged to respect those\n    requests where reasonable. More considerations for the public :\n    wiki.creativecommons.org/Considerations\\_for\\_licensees\n\nCreative Commons Attribution-NonCommercial 4.0 International Public\nLicense\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial 4.0 International Public License (\"Public\nLicense\"). To the extent this Public License may be interpreted as a\ncontract, You are granted the Licensed Rights in consideration of Your\nacceptance of these terms and conditions, and the Licensor grants You\nsuch rights in consideration of benefits the Licensor receives from\nmaking the Licensed Material available under these terms and conditions.\n\n-   Section 1 \u2013 Definitions.\n\n    -   a. Adapted Material means material subject to Copyright and\n        Similar Rights that is derived from or based upon the Licensed\n        Material and in which the Licensed Material is translated,\n        altered, arranged, transformed, or otherwise modified in a\n        manner requiring permission under the Copyright and Similar\n        Rights held by the Licensor. For purposes of this Public\n        License, where the Licensed Material is a musical work,\n        performance, or sound recording, Adapted Material is always\n        produced where the Licensed Material is synched in timed\n        relation with a moving image.\n    -   b. 
Adapter's License means the license You apply to Your\n        Copyright and Similar Rights in Your contributions to Adapted\n        Material in accordance with the terms and conditions of this\n        Public License.\n    -   c. Copyright and Similar Rights means copyright and/or similar\n        rights closely related to copyright including, without\n        limitation, performance, broadcast, sound recording, and Sui\n        Generis Database Rights, without regard to how the rights are\n        labeled or categorized. For purposes of this Public License, the\n        rights specified in Section 2(b)(1)-(2) are not Copyright and\n        Similar Rights.\n    -   d. Effective Technological Measures means those measures that,\n        in the absence of proper authority, may not be circumvented\n        under laws fulfilling obligations under Article 11 of the WIPO\n        Copyright Treaty adopted on December 20, 1996, and/or similar\n        international agreements.\n    -   e. Exceptions and Limitations means fair use, fair dealing,\n        and/or any other exception or limitation to Copyright and\n        Similar Rights that applies to Your use of the Licensed\n        Material.\n    -   f. Licensed Material means the artistic or literary work,\n        database, or other material to which the Licensor applied this\n        Public License.\n    -   g. Licensed Rights means the rights granted to You subject to\n        the terms and conditions of this Public License, which are\n        limited to all Copyright and Similar Rights that apply to Your\n        use of the Licensed Material and that the Licensor has authority\n        to license.\n    -   h. Licensor means the individual(s) or entity(ies) granting\n        rights under this Public License.\n    -   i. NonCommercial means not primarily intended for or directed\n        towards commercial advantage or monetary compensation. 
For\n        purposes of this Public License, the exchange of the Licensed\n        Material for other material subject to Copyright and Similar\n        Rights by digital file-sharing or similar means is NonCommercial\n        provided there is no payment of monetary compensation in\n        connection with the exchange.\n    -   j. Share means to provide material to the public by any means or\n        process that requires permission under the Licensed Rights, such\n        as reproduction, public display, public performance,\n        distribution, dissemination, communication, or importation, and\n        to make material available to the public including in ways that\n        members of the public may access the material from a place and\n        at a time individually chosen by them.\n    -   k. Sui Generis Database Rights means rights other than copyright\n        resulting from Directive 96/9/EC of the European Parliament and\n        of the Council of 11 March 1996 on the legal protection of\n        databases, as amended and/or succeeded, as well as other\n        essentially equivalent rights anywhere in the world.\n    -   l. You means the individual or entity exercising the Licensed\n        Rights under this Public License. Your has a corresponding\n        meaning.\n\n-   Section 2 \u2013 Scope.\n\n    -   a. License grant.\n        -   1. Subject to the terms and conditions of this Public\n            License, the Licensor hereby grants You a worldwide,\n            royalty-free, non-sublicensable, non-exclusive, irrevocable\n            license to exercise the Licensed Rights in the Licensed\n            Material to:\n            -   A. reproduce and Share the Licensed Material, in whole\n                or in part, for NonCommercial purposes only; and\n            -   B. produce, reproduce, and Share Adapted Material for\n                NonCommercial purposes only.\n        -   2. Exceptions and Limitations. 
For the avoidance of doubt,\n            where Exceptions and Limitations apply to Your use, this\n            Public License does not apply, and You do not need to comply\n            with its terms and conditions.\n        -   3. Term. The term of this Public License is specified in\n            Section 6(a).\n        -   4. Media and formats; technical modifications allowed. The\n            Licensor authorizes You to exercise the Licensed Rights in\n            all media and formats whether now known or hereafter\n            created, and to make technical modifications necessary to do\n            so. The Licensor waives and/or agrees not to assert any\n            right or authority to forbid You from making technical\n            modifications necessary to exercise the Licensed Rights,\n            including technical modifications necessary to circumvent\n            Effective Technological Measures. For purposes of this\n            Public License, simply making modifications authorized by\n            this Section 2(a)(4) never produces Adapted Material.\n        -   5. Downstream recipients.\n            -   A. Offer from the Licensor \u2013 Licensed Material. Every\n                recipient of the Licensed Material automatically\n                receives an offer from the Licensor to exercise the\n                Licensed Rights under the terms and conditions of this\n                Public License.\n            -   B. No downstream restrictions. You may not offer or\n                impose any additional or different terms or conditions\n                on, or apply any Effective Technological Measures to,\n                the Licensed Material if doing so restricts exercise of\n                the Licensed Rights by any recipient of the Licensed\n                Material.\n        -   6. No endorsement. 
Nothing in this Public License\n            constitutes or may be construed as permission to assert or\n            imply that You are, or that Your use of the Licensed\n            Material is, connected with, or sponsored, endorsed, or\n            granted official status by, the Licensor or others\n            designated to receive attribution as provided in Section\n            3(a)(1)(A)(i).\n    -   b. Other rights.\n        -   1. Moral rights, such as the right of integrity, are not\n            licensed under this Public License, nor are publicity,\n            privacy, and/or other similar personality rights; however,\n            to the extent possible, the Licensor waives and/or agrees\n            not to assert any such rights held by the Licensor to the\n            limited extent necessary to allow You to exercise the\n            Licensed Rights, but not otherwise.\n        -   2. Patent and trademark rights are not licensed under this\n            Public License.\n        -   3. To the extent possible, the Licensor waives any right to\n            collect royalties from You for the exercise of the Licensed\n            Rights, whether directly or through a collecting society\n            under any voluntary or waivable statutory or compulsory\n            licensing scheme. In all other cases the Licensor expressly\n            reserves any right to collect such royalties, including when\n            the Licensed Material is used other than for NonCommercial\n            purposes.\n\n-   Section 3 \u2013 License Conditions.\n\n    Your exercise of the Licensed Rights is expressly made subject to\n    the following conditions.\n\n    -   a. Attribution.\n        -   1. If You Share the Licensed Material (including in modified\n            form), You must:\n            -   A. retain the following if it is supplied by the\n                Licensor with the Licensed Material:\n                -   i. 
identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if designated);\n                -   ii. a copyright notice;\n                -   iii. a notice that refers to this Public License;\n                -   iv. a notice that refers to the disclaimer of\n                    warranties;\n                -   v. a URI or hyperlink to the Licensed Material to\n                    the extent reasonably practicable;\n            -   B. indicate if You modified the Licensed Material and\n                retain an indication of any previous modifications; and\n            -   C. indicate the Licensed Material is licensed under this\n                Public License, and include the text of, or the URI or\n                hyperlink to, this Public License.\n        -   2. You may satisfy the conditions in Section 3(a)(1) in any\n            reasonable manner based on the medium, means, and context in\n            which You Share the Licensed Material. For example, it may\n            be reasonable to satisfy the conditions by providing a URI\n            or hyperlink to a resource that includes the required\n            information.\n        -   3. If requested by the Licensor, You must remove any of the\n            information required by Section 3(a)(1)(A) to the extent\n            reasonably practicable.\n        -   4. If You Share Adapted Material You produce, the Adapter's\n            License You apply must not prevent recipients of the Adapted\n            Material from complying with this Public License.\n\n-   Section 4 \u2013 Sui Generis Database Rights.\n\n    Where the Licensed Rights include Sui Generis Database Rights that\n    apply to Your use of the Licensed Material:\n\n    -   a. 
for the avoidance of doubt, Section 2(a)(1) grants You the\n        right to extract, reuse, reproduce, and Share all or a\n        substantial portion of the contents of the database for\n        NonCommercial purposes only;\n    -   b. if You include all or a substantial portion of the database\n        contents in a database in which You have Sui Generis Database\n        Rights, then the database in which You have Sui Generis Database\n        Rights (but not its individual contents) is Adapted Material;\n        and\n    -   c. You must comply with the conditions in Section 3(a) if You\n        Share all or a substantial portion of the contents of the\n        database.\n\n    For the avoidance of doubt, this Section 4 supplements and does not\n    replace Your obligations under this Public License where the\n    Licensed Rights include other Copyright and Similar Rights.\n\n-   Section 5 \u2013 Disclaimer of Warranties and Limitation of Liability.\n\n    -   a. Unless otherwise separately undertaken by the Licensor, to\n        the extent possible, the Licensor offers the Licensed Material\n        as-is and as-available, and makes no representations or\n        warranties of any kind concerning the Licensed Material, whether\n        express, implied, statutory, or other. This includes, without\n        limitation, warranties of title, merchantability, fitness for a\n        particular purpose, non-infringement, absence of latent or other\n        defects, accuracy, or the presence or absence of errors, whether\n        or not known or discoverable. Where disclaimers of warranties\n        are not allowed in full or in part, this disclaimer may not\n        apply to You.\n    -   b. 
To the extent possible, in no event will the Licensor be\n        liable to You on any legal theory (including, without\n        limitation, negligence) or otherwise for any direct, special,\n        indirect, incidental, consequential, punitive, exemplary, or\n        other losses, costs, expenses, or damages arising out of this\n        Public License or use of the Licensed Material, even if the\n        Licensor has been advised of the possibility of such losses,\n        costs, expenses, or damages. Where a limitation of liability is\n        not allowed in full or in part, this limitation may not apply to\n        You.\n    -   c. The disclaimer of warranties and limitation of liability\n        provided above shall be interpreted in a manner that, to the\n        extent possible, most closely approximates an absolute\n        disclaimer and waiver of all liability.\n\n-   Section 6 \u2013 Term and Termination.\n\n    -   a. This Public License applies for the term of the Copyright and\n        Similar Rights licensed here. However, if You fail to comply\n        with this Public License, then Your rights under this Public\n        License terminate automatically.\n    -   b. Where Your right to use the Licensed Material has terminated\n        under Section 6(a), it reinstates:\n\n        -   1. automatically as of the date the violation is cured,\n            provided it is cured within 30 days of Your discovery of the\n            violation; or\n        -   2. upon express reinstatement by the Licensor.\n\n        For the avoidance of doubt, this Section 6(b) does not affect\n        any right the Licensor may have to seek remedies for Your\n        violations of this Public License.\n\n    -   c. 
For the avoidance of doubt, the Licensor may also offer the\n        Licensed Material under separate terms or conditions or stop\n        distributing the Licensed Material at any time; however, doing\n        so will not terminate this Public License.\n    -   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n        License.\n\n-   Section 7 \u2013 Other Terms and Conditions.\n\n    -   a. The Licensor shall not be bound by any additional or\n        different terms or conditions communicated by You unless\n        expressly agreed.\n    -   b. Any arrangements, understandings, or agreements regarding the\n        Licensed Material not stated herein are separate from and\n        independent of the terms and conditions of this Public License.\n\n-   Section 8 \u2013 Interpretation.\n\n    -   a. For the avoidance of doubt, this Public License does not, and\n        shall not be interpreted to, reduce, limit, restrict, or impose\n        conditions on any use of the Licensed Material that could\n        lawfully be made without permission under this Public License.\n    -   b. To the extent possible, if any provision of this Public\n        License is deemed unenforceable, it shall be automatically\n        reformed to the minimum extent necessary to make it enforceable.\n        If the provision cannot be reformed, it shall be severed from\n        this Public License without affecting the enforceability of the\n        remaining terms and conditions.\n    -   c. No term or condition of this Public License will be waived\n        and no failure to comply consented to unless expressly agreed to\n        by the Licensor.\n    -   d. Nothing in this Public License constitutes or may be\n        interpreted as a limitation upon, or waiver of, any privileges\n        and immunities that apply to the Licensor or You, including from\n        the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. 
Notwithstanding,\nCreative Commons may elect to apply one of its public licenses to\nmaterial it publishes and in those instances will be considered the\n\"Licensor.\" The text of the Creative Commons public licenses is\ndedicated to the public domain under the CC0 Public Domain Dedication.\nExcept for the limited purpose of indicating that material is shared\nunder a Creative Commons public license or as otherwise permitted by the\nCreative Commons policies published at creativecommons.org/policies,\nCreative Commons does not authorize the use of the trademark \"Creative\nCommons\" or any other trademark or logo of Creative Commons without its\nprior written consent including, without limitation, in connection with\nany unauthorized modifications to any of its public licenses or any\nother arrangements, understandings, or agreements concerning use of\nlicensed material. For the avoidance of doubt, this paragraph does not\nform part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n</code></pre>"},{"location":"normalization_e0s.html","title":"Overview of QM Methods and Normalization","text":"<p>OpenQDC provides support for 250+ QM Methods and provides a way to standardize and categorize the usage of different levels of theory used for Quantum Mechanics Single Point Calculations to add value and information to the datasets.</p>"},{"location":"normalization_e0s.html#level-of-theory","title":"Level of Theory","text":"<p>To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums consisting of a functional, a basis set, and a correction method. OpenQDC covers more than 106 functionals, 20 basis sets, and 11 correction methods. 
OpenQDC provides the computed isolated atom energies <code>e0</code> for each QM method.</p>"},{"location":"normalization_e0s.html#normalization","title":"Normalization","text":"<p>We support \"physical\" and \"regression\" normalization of energies to conserve the size extensivity of chemical systems. Through this normalization, OpenQDC provides a way to transform the potential energy into the atomization energy by subtracting the isolated atom energies <code>e0</code>, a physically interpretable and extensivity-conserving normalization method. Alternatively, we pre-compute the average contribution of each atom species to potential energy via linear or ridge regression, centering the distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.</p>"},{"location":"normalization_e0s.html#physical-normalization","title":"Physical Normalization","text":"<p><code>e0</code> energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from the potential energy to obtain the atomization energy. This normalization method is physically interpretable and only removes the atom energy contribution from the potential energy.</p>"},{"location":"normalization_e0s.html#regression-normalization","title":"Regression Normalization","text":"<p><code>e0</code> energies are calculated for each atom in the dataset from fitting a regression model to the potential energy. The <code>e0</code> energies are then subtracted from the potential energy to obtain the atomization energy. This normalization provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy. 
The resulting formation energy is centered at 0.</p>"},{"location":"usage.html","title":"Usage","text":""},{"location":"usage.html#how-to-use","title":"How to use","text":"<p>OpenQDC has been designed to be used with a single import:</p> <pre><code>import openqdc as qdc\ndataset = qdc.QM9()\n</code></pre> <p>All <code>openQDC</code> functions are available under <code>qdc</code>. Or if you want to directly import a specific dataset:</p> <pre><code>from openqdc import Spice\n# Spice dataset with distance unit in angstrom instead of bohr\ndataset = Spice(distance_unit=\"ang\",\n                array_format = \"jax\"\n)\ndataset[0] # dict of jax array\n</code></pre> <p>Or if you prefer handling <code>ase.Atoms</code> objects:</p> <pre><code>dataset.get_ase_atoms(0)\n</code></pre>"},{"location":"usage.html#iterators","title":"Iterators","text":"<p>OpenQDC provides a simple way to get the data as iterators:</p> <pre><code>for data in dataset.as_iter(atoms=True):\n    print(data) # Atoms object\n    break\n</code></pre> <p>or if you want to just iterate over the data:</p> <pre><code>for data in dataset:\n    print(data) # dict of arrays\n    break\n</code></pre>"},{"location":"usage.html#lazy-loading","title":"Lazy loading","text":"<p>OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during <code>import openqdc as qdc</code>. 
In case of trouble you can always disable lazy loading by setting the environment variable <code>OPENQDC_DISABLE_LAZY_LOADING</code> to <code>1</code>.</p>"},{"location":"API/basedataset.html","title":"BaseDataset","text":"<p>The BaseDataset defining shared functionality between all datasets.</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset","title":"<code>BaseDataset</code>","text":"<p>               Bases: <code>DatasetPropertyMixIn</code></p> <p>Base class for datasets in the openQDC package.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>class BaseDataset(DatasetPropertyMixIn):\n    \"\"\"\n    Base class for datasets in the openQDC package.\n    \"\"\"\n\n    energy_target_names = []\n    force_target_names = []\n    read_as_zarr = False\n    __energy_methods__ = []\n    __force_mask__ = []\n    __isolated_atom_energies__ = []\n    _fn_energy = lambda x: x\n    _fn_distance = lambda x: x\n    _fn_forces = lambda x: x\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __average_nb_atoms__ = None\n    __links__ = {}\n\n    def __init__(\n        self,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n        array_format: str = \"numpy\",\n        energy_type: Optional[str] = \"formation\",\n        overwrite_local_cache: bool = False,\n        cache_dir: Optional[str] = None,\n        recompute_statistics: bool = False,\n        transform: Optional[Callable] = None,\n        skip_statistics: bool = False,\n        read_as_zarr: bool = False,\n        regressor_kwargs: Dict = {\n            \"solver_type\": \"linear\",\n            \"sub_sample\": None,\n            \"stride\": 1,\n        },\n    ) -&gt; None:\n        \"\"\"\n\n        Parameters:\n            energy_unit:\n                Energy unit to convert dataset to. 
Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n            distance_unit:\n                Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n            array_format:\n                Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n            energy_type:\n                Type of isolated atom energy to use for the dataset. Default: \"formation\"\n                Supported types: [\"formation\", \"regression\", \"null\", None]\n            overwrite_local_cache:\n                Whether to overwrite the locally cached dataset.\n            cache_dir:\n                Cache directory location. Defaults to \"~/.cache/openqdc\"\n            recompute_statistics:\n                Whether to recompute the statistics of the dataset.\n            transform:\n                transformation to apply to the __getitem__ calls\n            regressor_kwargs:\n                Dictionary of keyword arguments to pass to the regressor.\n                Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n                solver_type can be one of [\"linear\", \"ridge\"]\n        \"\"\"\n        set_cache_dir(cache_dir)\n        # self._init_lambda_fn()\n        self.data = None\n        self._original_unit = self.energy_unit\n        self.recompute_statistics = recompute_statistics\n        self.regressor_kwargs = regressor_kwargs\n        self.transform = transform\n        self.read_as_zarr = read_as_zarr\n        self.energy_type = energy_type if energy_type is not None else \"null\"\n        self.refit_e0s = recompute_statistics or overwrite_local_cache\n        self.skip_statistics = skip_statistics\n        if not self.is_preprocessed():\n            raise DatasetNotAvailableError(self.__name__)\n        else:\n            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n        self.set_array_format(array_format)\n        
self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n\n    def _init_lambda_fn(self):\n        self._fn_energy = lambda x: x\n        self._fn_distance = lambda x: x\n        self._fn_forces = lambda x: x\n\n    @property\n    def dataset_wrapper(self):\n        if not hasattr(self, \"_dataset_wrapper\"):\n            self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()\n        return self._dataset_wrapper\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=self.__name__, links=self.__links__)\n\n    @classmethod\n    def fetch(cls, cache_path: Optional[str] = None, overwrite: bool = False) -&gt; None:\n        from openqdc.utils.download_api import DataDownloader\n\n        DataDownloader(cache_path, overwrite).from_config(cls.no_init().config)\n\n    def _post_init(\n        self,\n        overwrite_local_cache: bool = False,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n    ) -&gt; None:\n        self._set_units(None, None)\n        self._set_isolated_atom_energies()\n        if not self.skip_statistics:\n            self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)\n        self._set_units(energy_unit, distance_unit)\n        self._convert_data()\n        self._set_isolated_atom_energies()\n\n    def _precompute_statistics(self, overwrite_local_cache: bool = False):\n        # if self.recompute_statistics or overwrite_local_cache:\n        self.statistics = StatisticManager(\n            self,\n            self.recompute_statistics or overwrite_local_cache,  # check if we need to recompute\n            # Add the common statistics (Forces, TotalE, FormE, PerAtomE)\n            ForcesCalculatorStats,\n            TotalEnergyStats,\n            FormationEnergyStats,\n            PerAtomFormationEnergyStats,\n        )\n        
self.statistics.run_calculators()  # run the calculators\n        self._compute_average_nb_atoms()\n\n    @classmethod\n    def no_init(cls):\n        \"\"\"\n        Class method to avoid the __init__ method to be called when the class is instanciated.\n        Useful for debugging purposes or preprocessing data.\n        \"\"\"\n        return cls.__new__(cls)\n\n    @property\n    def __force_methods__(self):\n        \"\"\"\n        For backward compatibility. To be removed in the future.\n        \"\"\"\n        return self.force_methods\n\n    @property\n    def energy_methods(self) -&gt; List[str]:\n        \"\"\"Return the string version of the energy methods\"\"\"\n        return [str(i) for i in self.__energy_methods__]\n\n    @property\n    def force_mask(self):\n        if len(self.__class__.__force_mask__) == 0:\n            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)\n        return self.__class__.__force_mask__\n\n    @property\n    def force_methods(self):\n        return list(compress(self.energy_methods, self.force_mask))\n\n    @property\n    def e0s_dispatcher(self) -&gt; AtomEnergies:\n        \"\"\"\n        Property to get the object that dispatched the isolated atom energies of the QM methods.\n\n        Returns:\n            Object wrapping the isolated atom energies of the QM methods.\n        \"\"\"\n        if not hasattr(self, \"_e0s_dispatcher\"):\n            # Automatically fetch/compute formation or regression energies\n            self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)\n        return self._e0s_dispatcher\n\n    def _convert_data(self):\n        logger.info(\n            f\"Converting {self.__name__} data to the following units:\\n\\\n                     Energy: {str(self.energy_unit)},\\n\\\n                     Distance: {str(self.distance_unit)},\\n\\\n                     Forces: {str(self.force_unit) if self.__force_methods__ else 'None'}\"\n        )\n        for key in 
self.data_keys:\n            self.data[key] = self._convert_on_loading(self.data[key], key)\n\n    @property\n    def energy_unit(self):\n        return EnergyTypeConversion(self.__energy_unit__)\n\n    @property\n    def distance_unit(self):\n        return DistanceTypeConversion(self.__distance_unit__)\n\n    @property\n    def force_unit(self):\n        units = self.__forces_unit__.split(\"/\")\n        if len(units) &gt; 2:\n            units = [\"/\".join(units[:2]), units[-1]]\n        return ForceTypeConversion(tuple(units))  # &lt; 3.12 compatibility\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), self.__name__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def data_keys(self):\n        keys = list(self.data_types.keys())\n        if len(self.__force_methods__) == 0:\n            keys.remove(\"forces\")\n        return keys\n\n    @property\n    def pkl_data_keys(self):\n        return list(self.pkl_data_types.keys())\n\n    @property\n    def pkl_data_types(self):\n        return {\"name\": str, \"subset\": str, \"n_atoms\": np.int32}\n\n    @property\n    def atom_energies(self):\n        return self._e0s_dispatcher\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float64,\n            \"forces\": np.float32,\n        }\n\n    @property\n    def data_shapes(self):\n        return {\n            \"atomic_inputs\": (-1, NB_ATOMIC_FEATURES),\n            \"position_idx_range\": (-1, 2),\n            \"energies\": (-1, len(self.energy_methods)),\n            \"forces\": (-1, 3, len(self.force_methods)),\n        }\n\n    def _set_units(self, en: Optional[str] = None, ds: Optional[str] = None):\n        old_en, old_ds = self.energy_unit, self.distance_unit\n   
     en = en if en is not None else old_en\n        ds = ds if ds is not None else old_ds\n        self.set_energy_unit(en)\n        self.set_distance_unit(ds)\n        if self.__force_methods__:\n            self._fn_forces = self.force_unit.to(str(self.energy_unit), str(self.distance_unit))\n            self.__forces_unit__ = str(self.energy_unit) + \"/\" + str(self.distance_unit)\n\n    def _set_isolated_atom_energies(self):\n        if self.__energy_methods__ is None:\n            logger.error(\"No energy methods defined for this dataset.\")\n        if self.energy_type == \"formation\":\n            f = get_conversion(\"hartree\", self.__energy_unit__)\n        else:\n            # regression are calculated on the original unit of the dataset\n            f = self._original_unit.to(self.energy_unit)\n        self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix)\n\n    def convert_energy(self, x):\n        return self._fn_energy(x)\n\n    def convert_distance(self, x):\n        return self._fn_distance(x)\n\n    def convert_forces(self, x):\n        return self._fn_forces(x)\n\n    def set_energy_unit(self, value: str):\n        \"\"\"\n        Set a new energy unit for the dataset.\n\n        Parameters:\n            value:\n                New energy unit to set.\n        \"\"\"\n        # old_unit = self.energy_unit\n        # self.__energy_unit__ = value\n        self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n        self.__energy_unit__ = value\n\n    def set_distance_unit(self, value: str):\n        \"\"\"\n        Set a new distance unit for the dataset.\n\n        Parameters:\n            value:\n                New distance unit to set.\n        \"\"\"\n        # old_unit = self.distance_unit\n        # self.__distance_unit__ = value\n        self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n        self.__distance_unit__ = value\n\n    def set_array_format(self, format: 
str):\n        assert format in [\"numpy\", \"torch\", \"jax\"], f\"Format {format} not supported.\"\n        self.array_format = format\n\n    def read_raw_entries(self):\n        \"\"\"\n        Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n        \"\"\"\n        raise NotImplementedError\n\n    def collate_list(self, list_entries: List[Dict]) -&gt; Dict:\n        \"\"\"\n        Collate a list of entries into a single dictionary.\n\n        Parameters:\n            list_entries:\n                List of dictionaries containing the entries to collate.\n\n        Returns:\n            Dictionary containing the collated entries.\n        \"\"\"\n        # concatenate entries\n        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n        csum = np.cumsum(res.get(\"n_atoms\"))\n        x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n        x[1:, 0], x[:, 1] = csum[:-1], csum\n        res[\"position_idx_range\"] = x\n\n        return res\n\n    def save_preprocess(\n        self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n    ):\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n        Parameters:\n            data_dict:\n                Dictionary containing the preprocessed data.\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                Whether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. 
Cache is always overwritten locally.\n        \"\"\"\n        # save memmaps\n        logger.info(\"Preprocessing data and saving it to cache.\")\n        paths = self.dataset_wrapper.save_preprocess(\n            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n        )\n        if upload:\n            for local_path in paths:\n                push_remote(local_path, overwrite=overwrite)  # make it async?\n\n    def read_preprocess(self, overwrite_local_cache=False):\n        logger.info(\"Reading preprocessed data.\")\n        logger.info(\n            f\"Dataset {self.__name__} with the following units:\\n\\\n                     Energy: {self.energy_unit},\\n\\\n                     Distance: {self.distance_unit},\\n\\\n                     Forces: {self.force_unit if self.force_methods else 'None'}\"\n        )\n\n        self.data = self.dataset_wrapper.load_data(\n            self.preprocess_path,\n            self.data_keys,\n            self.data_types,\n            self.data_shapes,\n            self.pkl_data_keys,\n            overwrite_local_cache,\n        )  # this should be async if possible\n        for key in self.data:\n            logger.info(f\"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}\")\n\n    def _convert_on_loading(self, x, key):\n        if key == \"energies\":\n            return self.convert_energy(x)\n        elif key == \"forces\":\n            return self.convert_forces(x)\n        elif key == \"atomic_inputs\":\n            x = np.array(x, dtype=np.float32)\n            x[:, -3:] = self.convert_distance(x[:, -3:])\n            return x\n        else:\n            return x\n\n    def is_preprocessed(self) -&gt; bool:\n        \"\"\"\n        Check if the dataset is preprocessed and available online or locally.\n\n        Returns:\n            True if the dataset is available remotely or locally, False otherwise.\n        \"\"\"\n        predicats = [\n         
   copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def is_cached(self) -&gt; bool:\n        \"\"\"\n        Check if the dataset is cached locally.\n\n        Returns:\n            True if the dataset is cached locally, False otherwise.\n        \"\"\"\n        predicats = [\n            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n        \"\"\"\n        Preprocess the dataset and save it.\n\n        Parameters:\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                hether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n            as_zarr:\n                Whether to save the data as zarr files\n        \"\"\"\n        if overwrite or not self.is_preprocessed():\n            entries = self.read_raw_entries()\n            res = self.collate_list(entries)\n            self.save_preprocess(res, upload, overwrite, as_zarr)\n\n    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n        \"\"\"\n        Upload the preprocessed data to the remote storage. 
Must be called after preprocess and\n        need to have write privileges.\n\n        Parameters:\n            overwrite:\n                Whether to overwrite the remote data if it already exists\n            as_zarr:\n                Whether to upload the data as zarr files\n        \"\"\"\n        for key in self.data_keys:\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n            push_remote(local_path, overwrite=overwrite)\n        local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n        push_remote(local_path, overwrite=overwrite)\n\n    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n        \"\"\"\n        Save a single entry at index idx as an extxyz file.\n\n        Parameters:\n            idx:\n                Index of the entry\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file. 
If None, the current working directory is used.\n            ext:\n                Whether to include additional informations like forces and other metadatas (extxyz format)\n        \"\"\"\n        if path is None:\n            path = os.getcwd()\n        at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n        write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n\n    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n        \"\"\"\n        Save dataset as single xyz file (extended xyz format).\n\n        Parameters:\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file\n        \"\"\"\n        with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n            for atoms in tqdm(\n                self.as_iter(atoms=True, energy_method=energy_method),\n                total=len(self),\n                desc=f\"Saving {self.__name__} as xyz file\",\n            ):\n                write_extxyz(f, atoms, append=True)\n\n    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -&gt; Atoms:\n        \"\"\"\n        Get the ASE atoms object for the entry at index idx.\n\n        Parameters:\n            idx:\n                Index of the entry.\n            energy_method:\n                Index of the energy method to use\n            ext:\n                Whether to include additional informations\n\n        Returns:\n            ASE atoms object\n        \"\"\"\n        entry = self[idx]\n        at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n        return at\n\n    def subsample(\n        self, n_samples: Optional[Union[List[int], int, float]] = None, replace: bool = False, seed: int = 42\n    ):\n        np.random.seed(seed)\n        if n_samples is None:\n            return list(range(len(self)))\n        try:\n            if 0 &lt; n_samples &lt; 1:\n  
              n_samples = int(n_samples * len(self))\n            if isinstance(n_samples, int):\n                idxs = np.random.choice(len(self), size=n_samples, replace=replace)\n        except (ValueError, TypeError):  # list, set, np.ndarray\n            idxs = n_samples\n        return idxs\n\n    @requires_package(\"datamol\")\n    def calculate_descriptors(\n        self,\n        descriptor_name: str = \"soap\",\n        chemical_species: Optional[List[str]] = None,\n        n_samples: Optional[Union[List[int], int, float]] = None,\n        progress: bool = True,\n        **descriptor_kwargs,\n    ) -&gt; Dict[str, np.ndarray]:\n        \"\"\"\n        Compute the descriptors for the dataset.\n\n        Parameters:\n            descriptor_name:\n                Name of the descriptor to use. Supported descriptors are [\"soap\"]\n            chemical_species:\n                List of chemical species to use for the descriptor computation, by default None.\n                If None, the chemical species of the dataset are used.\n            n_samples:\n                Number of samples to use for the computation, by default None.\n                If None, all the dataset is used.\n                If a list of integers is provided, the descriptors are computed for\n                each of the specified idx of samples.\n            progress:\n                Whether to show a progress bar, by default True.\n            **descriptor_kwargs : dict\n                Keyword arguments to pass to the descriptor instantiation of the model.\n\n        Returns:\n            Dictionary containing the following keys:\n                - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n                - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n        \"\"\"\n        import datamol as dm\n\n        datum = {}\n        idxs = self.subsample(n_samples)\n        model = 
get_descriptor(descriptor_name.lower())(\n            species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n        )\n\n        def wrapper(idx):\n            entry = self.get_ase_atoms(idx, ext=False)\n            return model.calculate(entry)\n\n        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n        datum[\"values\"] = np.vstack(descr)\n        datum[\"idxs\"] = idxs\n        return datum\n\n    def as_iter(self, atoms: bool = False, energy_method: int = 0) -&gt; Iterable:\n        \"\"\"\n        Return the dataset as an iterator.\n\n        Parameters:\n            atoms:\n                Whether to return the items as ASE atoms object, by default False\n            energy_method:\n                Index of the energy method to use\n\n        Returns:\n            Iterator of the dataset\n        \"\"\"\n\n        func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n        for i in range(len(self)):\n            yield func(i)\n\n    def __iter__(self):\n        for idxs in range(len(self)):\n            yield self[idxs]\n\n    def get_statistics(self, return_none: bool = True) -&gt; Dict:\n        \"\"\"\n        Get the converted statistics of the dataset.\n\n        Parameters:\n            return_none :\n                Whether to return None if the statistics for the forces are not available, by default True\n                Otherwise, the statistics for the forces are set to 0.0\n\n        Returns:\n            Dictionary containing the statistics of the dataset\n        \"\"\"\n        selected_stats = self.statistics.get_results()\n        if len(selected_stats) == 0:\n            raise StatisticsNotAvailableError(self.__name__)\n        if not return_none:\n            selected_stats.update(\n                {\n                    \"ForcesCalculatorStats\": {\n                        \"mean\": 
np.array([0.0]),\n                        \"std\": np.array([0.0]),\n                        \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                    }\n                }\n            )\n        # cycle trough dict to convert units\n        for key, result in selected_stats.items():\n            if isinstance(result, ForcesCalculatorStats):\n                result.transform(self.convert_forces)\n            else:\n                result.transform(self.convert_energy)\n            result.transform(self._convert_array)\n        return {k: result.to_dict() for k, result in selected_stats.items()}\n\n    def __str__(self):\n        return f\"{self.__name__}\"\n\n    def __repr__(self):\n        return f\"{self.__name__}\"\n\n    def __len__(self):\n        return self.data[\"energies\"].shape[0]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n\n    def _convert_array(self, x: np.ndarray):\n        return _CONVERT_DICT.get(self.array_format)(x)\n\n    def __getitem__(self, idx: int):\n        shift = MAX_CHARGE\n        p_start, p_end = self.data[\"position_idx_range\"][idx]\n        input = self.data[\"atomic_inputs\"][p_start:p_end]\n        z, c, positions, energies = (\n            self._convert_array(np.array(input[:, 0], dtype=np.int32)),\n            self._convert_array(np.array(input[:, 1], dtype=np.int32)),\n            self._convert_array(np.array(input[:, -3:], dtype=np.float32)),\n            self._convert_array(np.array(self.data[\"energies\"][idx], dtype=np.float64)),\n        )\n        name = self.__smiles_converter__(self.data[\"name\"][idx])\n        subset = self.data[\"subset\"][idx]\n        e0s = 
self._convert_array(self.__isolated_atom_energies__[..., z, c + shift].T)\n        formation_energies = energies - e0s.sum(axis=0)\n        forces = None\n        if \"forces\" in self.data:\n            forces = self._convert_array(np.array(self.data[\"forces\"][p_start:p_end], dtype=np.float32))\n\n        bunch = Bunch(\n            positions=positions,\n            atomic_numbers=z,\n            charges=c,\n            e0=e0s,\n            energies=energies,\n            formation_energies=formation_energies,\n            per_atom_formation_energies=formation_energies / len(z),\n            name=name,\n            subset=subset,\n            forces=forces,\n        )\n\n        if self.transform is not None:\n            bunch = self.transform(bunch)\n\n        return bunch\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__force_methods__","title":"<code>__force_methods__</code>  <code>property</code>","text":"<p>For backward compatibility. To be removed in the future.</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.e0s_dispatcher","title":"<code>e0s_dispatcher: AtomEnergies</code>  <code>property</code>","text":"<p>Property to get the object that dispatched the isolated atom energies of the QM methods.</p> <p>Returns:</p> Type Description <code>AtomEnergies</code> <p>Object wrapping the isolated atom energies of the QM methods.</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.energy_methods","title":"<code>energy_methods: List[str]</code>  <code>property</code>","text":"<p>Return the string version of the energy methods</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__init__","title":"<code>__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 
'linear', 'sub_sample': None, 'stride': 1})</code>","text":"<p>Parameters:</p> Name Type Description Default <code>energy_unit</code> <code>Optional[str]</code> <p>Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]</p> <code>None</code> <code>distance_unit</code> <code>Optional[str]</code> <p>Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]</p> <code>None</code> <code>array_format</code> <code>str</code> <p>Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]</p> <code>'numpy'</code> <code>energy_type</code> <code>Optional[str]</code> <p>Type of isolated atom energy to use for the dataset. Default: \"formation\" Supported types: [\"formation\", \"regression\", \"null\", None]</p> <code>'formation'</code> <code>overwrite_local_cache</code> <code>bool</code> <p>Whether to overwrite the locally cached dataset.</p> <code>False</code> <code>cache_dir</code> <code>Optional[str]</code> <p>Cache directory location. Defaults to \"~/.cache/openqdc\"</p> <code>None</code> <code>recompute_statistics</code> <code>bool</code> <p>Whether to recompute the statistics of the dataset.</p> <code>False</code> <code>transform</code> <code>Optional[Callable]</code> <p>transformation to apply to the getitem calls</p> <code>None</code> <code>regressor_kwargs</code> <code>Dict</code> <p>Dictionary of keyword arguments to pass to the regressor. 
Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1} solver_type can be one of [\"linear\", \"ridge\"]</p> <code>{'solver_type': 'linear', 'sub_sample': None, 'stride': 1}</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def __init__(\n    self,\n    energy_unit: Optional[str] = None,\n    distance_unit: Optional[str] = None,\n    array_format: str = \"numpy\",\n    energy_type: Optional[str] = \"formation\",\n    overwrite_local_cache: bool = False,\n    cache_dir: Optional[str] = None,\n    recompute_statistics: bool = False,\n    transform: Optional[Callable] = None,\n    skip_statistics: bool = False,\n    read_as_zarr: bool = False,\n    regressor_kwargs: Dict = {\n        \"solver_type\": \"linear\",\n        \"sub_sample\": None,\n        \"stride\": 1,\n    },\n) -&gt; None:\n    \"\"\"\n\n    Parameters:\n        energy_unit:\n            Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n        distance_unit:\n            Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n        array_format:\n            Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n        energy_type:\n            Type of isolated atom energy to use for the dataset. Default: \"formation\"\n            Supported types: [\"formation\", \"regression\", \"null\", None]\n        overwrite_local_cache:\n            Whether to overwrite the locally cached dataset.\n        cache_dir:\n            Cache directory location. 
Defaults to \"~/.cache/openqdc\"\n        recompute_statistics:\n            Whether to recompute the statistics of the dataset.\n        transform:\n            transformation to apply to the __getitem__ calls\n        regressor_kwargs:\n            Dictionary of keyword arguments to pass to the regressor.\n            Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n            solver_type can be one of [\"linear\", \"ridge\"]\n    \"\"\"\n    set_cache_dir(cache_dir)\n    # self._init_lambda_fn()\n    self.data = None\n    self._original_unit = self.energy_unit\n    self.recompute_statistics = recompute_statistics\n    self.regressor_kwargs = regressor_kwargs\n    self.transform = transform\n    self.read_as_zarr = read_as_zarr\n    self.energy_type = energy_type if energy_type is not None else \"null\"\n    self.refit_e0s = recompute_statistics or overwrite_local_cache\n    self.skip_statistics = skip_statistics\n    if not self.is_preprocessed():\n        raise DatasetNotAvailableError(self.__name__)\n    else:\n        self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n    self.set_array_format(array_format)\n    self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.as_iter","title":"<code>as_iter(atoms=False, energy_method=0)</code>","text":"<p>Return the dataset as an 
iterator.</p> <p>Parameters:</p> Name Type Description Default <code>atoms</code> <code>bool</code> <p>Whether to return the items as ASE atoms object, by default False</p> <code>False</code> <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <p>Returns:</p> Type Description <code>Iterable</code> <p>Iterator of the dataset</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def as_iter(self, atoms: bool = False, energy_method: int = 0) -&gt; Iterable:\n    \"\"\"\n    Return the dataset as an iterator.\n\n    Parameters:\n        atoms:\n            Whether to return the items as ASE atoms object, by default False\n        energy_method:\n            Index of the energy method to use\n\n    Returns:\n        Iterator of the dataset\n    \"\"\"\n\n    func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n    for i in range(len(self)):\n        yield func(i)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.calculate_descriptors","title":"<code>calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)</code>","text":"<p>Compute the descriptors for the dataset.</p> <p>Parameters:</p> Name Type Description Default <code>descriptor_name</code> <code>str</code> <p>Name of the descriptor to use. Supported descriptors are [\"soap\"]</p> <code>'soap'</code> <code>chemical_species</code> <code>Optional[List[str]]</code> <p>List of chemical species to use for the descriptor computation, by default None. If None, the chemical species of the dataset are used.</p> <code>None</code> <code>n_samples</code> <code>Optional[Union[List[int], int, float]]</code> <p>Number of samples to use for the computation, by default None. If None, all the dataset is used. 
If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.</p> <code>None</code> <code>progress</code> <code>bool</code> <p>Whether to show a progress bar, by default True.</p> <code>True</code> <code>**descriptor_kwargs</code> <p>dict Keyword arguments to pass to the descriptor instantiation of the model.</p> <code>{}</code> <p>Returns:</p> Type Description <code>Dict[str, ndarray]</code> <p>Dictionary containing the following keys: - values : np.ndarray of shape (N, M) containing the descriptors for the dataset - idxs : np.ndarray of shape (N,) containing the indices of the samples used</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>@requires_package(\"datamol\")\ndef calculate_descriptors(\n    self,\n    descriptor_name: str = \"soap\",\n    chemical_species: Optional[List[str]] = None,\n    n_samples: Optional[Union[List[int], int, float]] = None,\n    progress: bool = True,\n    **descriptor_kwargs,\n) -&gt; Dict[str, np.ndarray]:\n    \"\"\"\n    Compute the descriptors for the dataset.\n\n    Parameters:\n        descriptor_name:\n            Name of the descriptor to use. 
Supported descriptors are [\"soap\"]\n        chemical_species:\n            List of chemical species to use for the descriptor computation, by default None.\n            If None, the chemical species of the dataset are used.\n        n_samples:\n            Number of samples to use for the computation, by default None.\n            If None, all the dataset is used.\n            If a list of integers is provided, the descriptors are computed for\n            each of the specified idx of samples.\n        progress:\n            Whether to show a progress bar, by default True.\n        **descriptor_kwargs : dict\n            Keyword arguments to pass to the descriptor instantiation of the model.\n\n    Returns:\n        Dictionary containing the following keys:\n            - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n            - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n    \"\"\"\n    import datamol as dm\n\n    datum = {}\n    idxs = self.subsample(n_samples)\n    model = get_descriptor(descriptor_name.lower())(\n        species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n    )\n\n    def wrapper(idx):\n        entry = self.get_ase_atoms(idx, ext=False)\n        return model.calculate(entry)\n\n    descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n    datum[\"values\"] = np.vstack(descr)\n    datum[\"idxs\"] = idxs\n    return datum\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.collate_list","title":"<code>collate_list(list_entries)</code>","text":"<p>Collate a list of entries into a single dictionary.</p> <p>Parameters:</p> Name Type Description Default <code>list_entries</code> <code>List[Dict]</code> <p>List of dictionaries containing the entries to collate.</p> required <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary containing the collated 
entries.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def collate_list(self, list_entries: List[Dict]) -&gt; Dict:\n    \"\"\"\n    Collate a list of entries into a single dictionary.\n\n    Parameters:\n        list_entries:\n            List of dictionaries containing the entries to collate.\n\n    Returns:\n        Dictionary containing the collated entries.\n    \"\"\"\n    # concatenate entries\n    res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n    csum = np.cumsum(res.get(\"n_atoms\"))\n    x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n    x[1:, 0], x[:, 1] = csum[:-1], csum\n    res[\"position_idx_range\"] = x\n\n    return res\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_ase_atoms","title":"<code>get_ase_atoms(idx, energy_method=0, ext=True)</code>","text":"<p>Get the ASE atoms object for the entry at index idx.</p> <p>Parameters:</p> Name Type Description Default <code>idx</code> <code>int</code> <p>Index of the entry.</p> required <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <code>ext</code> <code>bool</code> <p>Whether to include additional informations</p> <code>True</code> <p>Returns:</p> Type Description <code>Atoms</code> <p>ASE atoms object</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -&gt; Atoms:\n    \"\"\"\n    Get the ASE atoms object for the entry at index idx.\n\n    Parameters:\n        idx:\n            Index of the entry.\n        energy_method:\n            Index of the energy method to use\n        ext:\n            Whether to include additional informations\n\n    Returns:\n        ASE atoms object\n    \"\"\"\n    entry = self[idx]\n    at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n    return 
at\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_statistics","title":"<code>get_statistics(return_none=True)</code>","text":"<p>Get the converted statistics of the dataset.</p> <p>Parameters:</p> Name Type Description Default <code>return_none</code> <p>Whether to return None if the statistics for the forces are not available, by default True Otherwise, the statistics for the forces are set to 0.0</p> <code>True</code> <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary containing the statistics of the dataset</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def get_statistics(self, return_none: bool = True) -&gt; Dict:\n    \"\"\"\n    Get the converted statistics of the dataset.\n\n    Parameters:\n        return_none :\n            Whether to return None if the statistics for the forces are not available, by default True\n            Otherwise, the statistics for the forces are set to 0.0\n\n    Returns:\n        Dictionary containing the statistics of the dataset\n    \"\"\"\n    selected_stats = self.statistics.get_results()\n    if len(selected_stats) == 0:\n        raise StatisticsNotAvailableError(self.__name__)\n    if not return_none:\n        selected_stats.update(\n            {\n                \"ForcesCalculatorStats\": {\n                    \"mean\": np.array([0.0]),\n                    \"std\": np.array([0.0]),\n                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                }\n            }\n        )\n    # cycle trough dict to convert units\n    for key, result in selected_stats.items():\n        if isinstance(result, ForcesCalculatorStats):\n            result.transform(self.convert_forces)\n        else:\n            result.transform(self.convert_energy)\n        
result.transform(self._convert_array)\n    return {k: result.to_dict() for k, result in selected_stats.items()}\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_cached","title":"<code>is_cached()</code>","text":"<p>Check if the dataset is cached locally.</p> <p>Returns:</p> Type Description <code>bool</code> <p>True if the dataset is cached locally, False otherwise.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def is_cached(self) -&gt; bool:\n    \"\"\"\n    Check if the dataset is cached locally.\n\n    Returns:\n        True if the dataset is cached locally, False otherwise.\n    \"\"\"\n    predicats = [\n        os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_preprocessed","title":"<code>is_preprocessed()</code>","text":"<p>Check if the dataset is preprocessed and available online or locally.</p> <p>Returns:</p> Type Description <code>bool</code> <p>True if the dataset is available remotely or locally, False otherwise.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def is_preprocessed(self) -&gt; bool:\n    \"\"\"\n    Check if the dataset is preprocessed and available online or locally.\n\n    Returns:\n        True if the dataset is available remotely or locally, False otherwise.\n    \"\"\"\n    predicats = [\n        copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return 
all(predicats)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.no_init","title":"<code>no_init()</code>  <code>classmethod</code>","text":"<p>Class method to avoid the init method to be called when the class is instantiated. Useful for debugging purposes or preprocessing data.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>@classmethod\ndef no_init(cls):\n    \"\"\"\n    Class method to avoid the __init__ method to be called when the class is instantiated.\n    Useful for debugging purposes or preprocessing data.\n    \"\"\"\n    return cls.__new__(cls)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.preprocess","title":"<code>preprocess(upload=False, overwrite=True, as_zarr=True)</code>","text":"<p>Preprocess the dataset and save it.</p> <p>Parameters:</p> Name Type Description Default <code>upload</code> <code>bool</code> <p>Whether to upload the preprocessed data to the remote storage or only saving it locally.</p> <code>False</code> <code>overwrite</code> <code>bool</code> <p>Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.</p> <code>True</code> <code>as_zarr</code> <code>bool</code> <p>Whether to save the data as zarr files</p> <code>True</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n    \"\"\"\n    Preprocess the dataset and save it.\n\n    Parameters:\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. 
Cache is always overwritten locally.\n        as_zarr:\n            Whether to save the data as zarr files\n    \"\"\"\n    if overwrite or not self.is_preprocessed():\n        entries = self.read_raw_entries()\n        res = self.collate_list(entries)\n        self.save_preprocess(res, upload, overwrite, as_zarr)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.read_raw_entries","title":"<code>read_raw_entries()</code>","text":"<p>Preprocess the raw (aka from the fetched source) into a list of dictionaries.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def read_raw_entries(self):\n    \"\"\"\n    Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n    \"\"\"\n    raise NotImplementedError\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_preprocess","title":"<code>save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)</code>","text":"<p>Save the preprocessed data to the cache directory and optionally upload it to the remote storage.</p> <p>Parameters:</p> Name Type Description Default <code>data_dict</code> <code>Dict[str, ndarray]</code> <p>Dictionary containing the preprocessed data.</p> required <code>upload</code> <code>bool</code> <p>Whether to upload the preprocessed data to the remote storage or only saving it locally.</p> <code>False</code> <code>overwrite</code> <code>bool</code> <p>Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. 
Cache is always overwritten locally.</p> <code>True</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def save_preprocess(\n    self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n):\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n    Parameters:\n        data_dict:\n            Dictionary containing the preprocessed data.\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n    \"\"\"\n    # save memmaps\n    logger.info(\"Preprocessing data and saving it to cache.\")\n    paths = self.dataset_wrapper.save_preprocess(\n        self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n    )\n    if upload:\n        for local_path in paths:\n            push_remote(local_path, overwrite=overwrite)  # make it async?\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_xyz","title":"<code>save_xyz(idx, energy_method=0, path=None, ext=True)</code>","text":"<p>Save a single entry at index idx as an extxyz file.</p> <p>Parameters:</p> Name Type Description Default <code>idx</code> <code>int</code> <p>Index of the entry</p> required <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <code>path</code> <code>Optional[str]</code> <p>Path to save the xyz file. 
If None, the current working directory is used.</p> <code>None</code> <code>ext</code> <code>bool</code> <p>Whether to include additional information like forces and other metadata (extxyz format)</p> <code>True</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n    \"\"\"\n    Save a single entry at index idx as an extxyz file.\n\n    Parameters:\n        idx:\n            Index of the entry\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file. If None, the current working directory is used.\n        ext:\n            Whether to include additional information like forces and other metadata (extxyz format)\n    \"\"\"\n    if path is None:\n        path = os.getcwd()\n    at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n    write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_distance_unit","title":"<code>set_distance_unit(value)</code>","text":"<p>Set a new distance unit for the dataset.</p> <p>Parameters:</p> Name Type Description Default <code>value</code> <code>str</code> <p>New distance unit to set.</p> required Source code in <code>openqdc/datasets/base.py</code> <pre><code>def set_distance_unit(self, value: str):\n    \"\"\"\n    Set a new distance unit for the dataset.\n\n    Parameters:\n        value:\n            New distance unit to set.\n    \"\"\"\n    # old_unit = self.distance_unit\n    # self.__distance_unit__ = value\n    self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n    self.__distance_unit__ = value\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_energy_unit","title":"<code>set_energy_unit(value)</code>","text":"<p>Set a new energy unit for the 
dataset.</p> <p>Parameters:</p> Name Type Description Default <code>value</code> <code>str</code> <p>New energy unit to set.</p> required Source code in <code>openqdc/datasets/base.py</code> <pre><code>def set_energy_unit(self, value: str):\n    \"\"\"\n    Set a new energy unit for the dataset.\n\n    Parameters:\n        value:\n            New energy unit to set.\n    \"\"\"\n    # old_unit = self.energy_unit\n    # self.__energy_unit__ = value\n    self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n    self.__energy_unit__ = value\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.to_xyz","title":"<code>to_xyz(energy_method=0, path=None)</code>","text":"<p>Save dataset as single xyz file (extended xyz format).</p> <p>Parameters:</p> Name Type Description Default <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <code>path</code> <code>Optional[str]</code> <p>Path to save the xyz file</p> <code>None</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n    \"\"\"\n    Save dataset as single xyz file (extended xyz format).\n\n    Parameters:\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file\n    \"\"\"\n    with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n        for atoms in tqdm(\n            self.as_iter(atoms=True, energy_method=energy_method),\n            total=len(self),\n            desc=f\"Saving {self.__name__} as xyz file\",\n        ):\n            write_extxyz(f, atoms, append=True)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.upload","title":"<code>upload(overwrite=False, as_zarr=False)</code>","text":"<p>Upload the preprocessed data to the remote storage. 
Must be called after preprocess and need to have write privileges.</p> <p>Parameters:</p> Name Type Description Default <code>overwrite</code> <code>bool</code> <p>Whether to overwrite the remote data if it already exists</p> <code>False</code> <code>as_zarr</code> <code>bool</code> <p>Whether to upload the data as zarr files</p> <code>False</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def upload(self, overwrite: bool = False, as_zarr: bool = False):\n    \"\"\"\n    Upload the preprocessed data to the remote storage. Must be called after preprocess and\n    need to have write privileges.\n\n    Parameters:\n        overwrite:\n            Whether to overwrite the remote data if it already exists\n        as_zarr:\n            Whether to upload the data as zarr files\n    \"\"\"\n    for key in self.data_keys:\n        local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n        push_remote(local_path, overwrite=overwrite)\n    local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n    push_remote(local_path, overwrite=overwrite)\n</code></pre>"},{"location":"API/e0_dispatcher.html","title":"e0 Dispatcher","text":""},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies","title":"<code>AtomEnergies</code>","text":"<p>Manager class to interface with the isolated atom energies classes and provide the general functions to retrieve the data</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class AtomEnergies:\n    \"\"\"\n    Manager class to interface with the isolated atom energies classes\n    and provide the general functions to retrieve the data\n    \"\"\"\n\n    def __init__(self, data, **kwargs) -&gt; None:\n        self.atom_energies = data.energy_type\n        self.factory = dispatch_factory(data, **kwargs)\n\n    @property\n    def e0s_matrix(self) -&gt; np.ndarray:\n        \"\"\"\n        Return the 
isolated atom energies dictionary\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_matrix\n\n    @property\n    def e0s_dict(self) -&gt; Dict[AtomSpecies, AtomEnergy]:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_dict\n\n    def __str__(self):\n        return f\"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}\"\n\n    def __repr__(self):\n        return str(self)\n\n    def __getitem__(self, item: AtomSpecies) -&gt; AtomEnergy:\n        \"\"\"\n        Retrieve a key from the isolated atom dictionary.\n        Item can be written as tuple(Symbol, charge),\n        tuple(Chemical number, charge). If no charge is passed,\n        it will be automatically set to 0.\n\n        Examples:\n            AtomEnergies[6], AtomEnergies[6,1], \\n\n            AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n            AtomEnergies[(\"C,1)]\n\n        Parameters:\n            item:\n                AtomSpecies object or tuple with the atom symbol and charge\n\n        Returns:\n            AtomEnergy object with the isolated atom energy\n        \"\"\"\n        try:\n            atom, charge = item[0], item[1]\n        except TypeError:\n            atom = item\n            charge = 0\n        except IndexError:\n            atom = item[0]\n            charge = 0\n        if not isinstance(atom, str):\n            atom = ATOM_SYMBOLS[atom]\n        return self.e0s_dict[(atom, charge)]\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_dict","title":"<code>e0s_dict: Dict[AtomSpecies, AtomEnergy]</code>  <code>property</code>","text":"<p>Return the isolated atom energies dictionary</p> <p>Returns:</p> Type Description <code>Dict[AtomSpecies, AtomEnergy]</code> <p>Dictionary with the isolated 
atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_matrix","title":"<code>e0s_matrix: np.ndarray</code>  <code>property</code>","text":"<p>Return the isolated atom energies matrix</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Matrix Array with the isolated atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.__getitem__","title":"<code>__getitem__(item)</code>","text":"<p>Retrieve a key from the isolated atom dictionary. Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). If no charge is passed, it will be automatically set to 0.</p> <p>Examples:</p> <p>AtomEnergies[6], AtomEnergies[6,1], </p> <p>AtomEnergies[\"C\",1], AtomEnergies[(6,1)], </p> <p>AtomEnergies[(\"C\",1)]</p> <p>Parameters:</p> Name Type Description Default <code>item</code> <code>AtomSpecies</code> <p>AtomSpecies object or tuple with the atom symbol and charge</p> required <p>Returns:</p> Type Description <code>AtomEnergy</code> <p>AtomEnergy object with the isolated atom energy</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def __getitem__(self, item: AtomSpecies) -&gt; AtomEnergy:\n    \"\"\"\n    Retrieve a key from the isolated atom dictionary.\n    Item can be written as tuple(Symbol, charge),\n    tuple(Chemical number, charge). 
If no charge is passed,\n    it will be automatically set to 0.\n\n    Examples:\n        AtomEnergies[6], AtomEnergies[6,1], \\n\n        AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n        AtomEnergies[(\"C,1)]\n\n    Parameters:\n        item:\n            AtomSpecies object or tuple with the atom symbol and charge\n\n    Returns:\n        AtomEnergy object with the isolated atom energy\n    \"\"\"\n    try:\n        atom, charge = item[0], item[1]\n    except TypeError:\n        atom = item\n        charge = 0\n    except IndexError:\n        atom = item[0]\n        charge = 0\n    if not isinstance(atom, str):\n        atom = ATOM_SYMBOLS[atom]\n    return self.e0s_dict[(atom, charge)]\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy","title":"<code>AtomEnergy</code>  <code>dataclass</code>","text":"<p>Datastructure to store isolated atom energies and the std deviation associated to the value. By default the std will be 1 if no value was calculated or not available (formation energy case)</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>@dataclass\nclass AtomEnergy:\n    \"\"\"\n    Datastructure to store isolated atom energies\n    and the std deviation associated to the value.\n    By default the std will be 1 if no value was calculated\n    or not available (formation energy case)\n    \"\"\"\n\n    mean: np.array\n    std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32))\n\n    def __post_init__(self):\n        if not isinstance(self.mean, np.ndarray):\n            self.mean = np.array([self.mean], dtype=np.float32)\n\n    def append(self, other: \"AtomEnergy\"):\n        \"\"\"\n        Append the mean and std of another atom energy\n        \"\"\"\n        self.mean = np.append(self.mean, other.mean)\n        self.std = np.append(self.std, 
other.std)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy.append","title":"<code>append(other)</code>","text":"<p>Append the mean and std of another atom energy</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def append(self, other: \"AtomEnergy\"):\n    \"\"\"\n    Append the mean and std of another atom energy\n    \"\"\"\n    self.mean = np.append(self.mean, other.mean)\n    self.std = np.append(self.std, other.std)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomSpecies","title":"<code>AtomSpecies</code>  <code>dataclass</code>","text":"<p>Structure that defines a tuple of chemical specie and charge and provide hash and automatic conversion from atom number to checmical symbol</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>@dataclass(frozen=False, eq=True)\nclass AtomSpecies:\n    \"\"\"\n    Structure that defines a tuple of chemical specie and charge\n    and provide hash and automatic conversion from atom number to\n    checmical symbol\n    \"\"\"\n\n    symbol: Union[str, int]\n    charge: int = 0\n\n    def __post_init__(self):\n        if not isinstance(self.symbol, str):\n            self.symbol = ATOM_SYMBOLS[self.symbol]\n        self.number = ATOMIC_NUMBERS[self.symbol]\n\n    def __hash__(self):\n        return hash((self.symbol, self.charge))\n\n    def __eq__(self, other):\n        if not isinstance(other, AtomSpecies):\n            symbol, charge = other[0], other[1]\n            other = AtomSpecies(symbol=symbol, charge=charge)\n        return (self.number, self.charge) == (other.number, other.charge)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface","title":"<code>IsolatedEnergyInterface</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract class that defines the interface for the different implementation of an isolated atom energy value</p> 
Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class IsolatedEnergyInterface(ABC):\n    \"\"\"\n    Abstract class that defines the interface for the\n    different implementation of an isolated atom energy value\n    \"\"\"\n\n    def __init__(self, data, **kwargs):\n        \"\"\"\n        Parameters:\n            data : openqdc.datasets.Dataset\n                Dataset object that contains the information\n                about the isolated atom energies. Info will be passed\n                by references\n            kwargs : dict\n                Additional arguments that will be passed to the\n                selected energy class. Mostly used for regression\n                to pass the regressor_kwargs.\n        \"\"\"\n        self._e0_matrixs = []\n        self._e0_dict = None\n        self.kwargs = kwargs\n        self.data = data\n        self._post_init()\n\n    @property\n    def refit(self) -&gt; bool:\n        return self.data.refit_e0s\n\n    @abstractmethod\n    def _post_init(self):\n        \"\"\"\n        Main method to fetch/compute/recomputed the isolated atom energies.\n        Need to be implemented in all child classes.\n        \"\"\"\n        pass\n\n    def __len__(self):\n        return len(self.data.energy_methods)\n\n    @property\n    def e0_matrix(self) -&gt; np.ndarray:\n        \"\"\"\n        Return the isolated atom energies matrixes\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return np.array(self._e0_matrixs)\n\n    @property\n    def e0_dict(self) -&gt; Dict:\n        \"\"\"\n        Return the isolated atom energies dict\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n\n        return self._e0s_dict\n\n    def __str__(self) -&gt; str:\n        return 
self.__class__.__name__.lower()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_dict","title":"<code>e0_dict: Dict</code>  <code>property</code>","text":"<p>Return the isolated atom energies dict</p> <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary with the isolated atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_matrix","title":"<code>e0_matrix: np.ndarray</code>  <code>property</code>","text":"<p>Return the isolated atom energies matrixes</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Matrix Array with the isolated atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.__init__","title":"<code>__init__(data, **kwargs)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>data</code> <p>openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by references</p> required <code>kwargs</code> <p>dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.</p> <code>{}</code> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def __init__(self, data, **kwargs):\n    \"\"\"\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. 
Mostly used for regression\n            to pass the regressor_kwargs.\n    \"\"\"\n    self._e0_matrixs = []\n    self._e0_dict = None\n    self.kwargs = kwargs\n    self.data = data\n    self._post_init()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.NullEnergy","title":"<code>NullEnergy</code>","text":"<p>               Bases: <code>IsolatedEnergyInterface</code></p> <p>Class that returns a null (zeros) matrix for the isolated atom energies in case of no energies are available.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class NullEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a null (zeros) matrix for the isolated atom energies in case\n    of no energies are available.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for _ in self.data.__energy_methods__:\n            for key, values in PotentialMethod.NONE.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))]\n        self._assembly_e0_dict()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.PhysicalEnergy","title":"<code>PhysicalEnergy</code>","text":"<p>               Bases: <code>IsolatedEnergyInterface</code></p> <p>Class that returns a physical (SE,DFT,etc) isolated atom energies.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class PhysicalEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a physical (SE,DFT,etc) isolated atom energies.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for method in 
self.data.__energy_methods__:\n            for key, values in method.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__]\n        self._assembly_e0_dict()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy","title":"<code>RegressionEnergy</code>","text":"<p>               Bases: <code>IsolatedEnergyInterface</code></p> <p>Class that compute and returns the regressed isolated atom energies.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class RegressionEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that compute and returns the regressed isolated atom energies.\n    \"\"\"\n\n    def _post_init(self):\n        if not self.attempt_load() or self.refit:\n            self.regressor = Regressor.from_openqdc_dataset(self.data, **self.kwargs)\n            E0s, cov = self._compute_regression_e0s()\n            self._set_lin_atom_species_dict(E0s, cov)\n        self._set_linear_e0s()\n\n    def _compute_regression_e0s(self) -&gt; Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Try to compute the regressed isolated atom energies.\n        raise an error if the regression fails.\n        return the regressed isolated atom energies and the uncertainty values.\n\n        Returns:\n            Tuple with the regressed isolated atom energies and the uncertainty values of the regression\n            if available.\n        \"\"\"\n        try:\n            E0s, cov = self.regressor.solve()\n        except np.linalg.LinAlgError:\n            logger.warning(f\"Failed to compute E0s using {self.regressor.solver_type} 
regression.\")\n            raise np.linalg.LinAlgError\n        return E0s, cov\n\n    def _set_lin_atom_species_dict(self, E0s, covs) -&gt; None:\n        \"\"\"\n        Set the regressed isolated atom energies in a dictionary format\n        and Save the values in a pickle file to easy loading.\n        \"\"\"\n        atomic_energies_dict = {}\n        for i, z in enumerate(self.regressor.numbers):\n            for charge in range(-10, 11):\n                atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i])\n            # atomic_energies_dict[z] = E0s[i]\n        self._e0s_dict = atomic_energies_dict\n        self.save_e0s()\n\n    def _set_linear_e0s(self) -&gt; None:\n        \"\"\"\n        Transform the e0s dictionary into the correct e0s\n        matrix format.\n        \"\"\"\n        new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]\n        for z, e0 in self._e0s_dict.items():\n            for i in range(len(self)):\n                # new_e0s[i][z, :] = e0[i]\n                new_e0s[i][z.number, z.charge] = e0.mean[i]\n            # for atom_sp, values in\n        self._e0_matrixs = new_e0s\n\n    def save_e0s(self) -&gt; None:\n        \"\"\"\n        Save the regressed isolated atom energies in a pickle file.\n        \"\"\"\n        save_pkl(self._e0s_dict, self.preprocess_path)\n\n    def attempt_load(self) -&gt; bool:\n        \"\"\"\n        Try to load the regressed isolated atom energies from the\n        object pickle file and return the success of the operation.\n        \"\"\"\n        try:\n            self._e0s_dict = load_pkl(self.preprocess_path)\n            logger.info(f\"Found energy file for {str(self)}.\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Energy file for {str(self)} not found.\")\n            return False\n\n    @property\n    def preprocess_path(self):\n        \"\"\"\n        
Return the path to the object pickle file.\n        \"\"\"\n        path = p_join(self.data.root, \"preprocessed\", str(self) + \".pkl\")\n        return path\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.preprocess_path","title":"<code>preprocess_path</code>  <code>property</code>","text":"<p>Return the path to the object pickle file.</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.attempt_load","title":"<code>attempt_load()</code>","text":"<p>Try to load the regressed isolated atom energies from the object pickle file and return the success of the operation.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def attempt_load(self) -&gt; bool:\n    \"\"\"\n    Try to load the regressed isolated atom energies from the\n    object pickle file and return the success of the operation.\n    \"\"\"\n    try:\n        self._e0s_dict = load_pkl(self.preprocess_path)\n        logger.info(f\"Found energy file for {str(self)}.\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Energy file for {str(self)} not found.\")\n        return False\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.save_e0s","title":"<code>save_e0s()</code>","text":"<p>Save the regressed isolated atom energies in a pickle file.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def save_e0s(self) -&gt; None:\n    \"\"\"\n    Save the regressed isolated atom energies in a pickle file.\n    \"\"\"\n    save_pkl(self._e0s_dict, self.preprocess_path)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.dispatch_factory","title":"<code>dispatch_factory(data, **kwargs)</code>","text":"<p>Factory function that select the correct energy class for the fetching/calculation of isolated atom energies.</p> <p>Parameters:</p> Name Type Description Default <code>data</code> 
<p>openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by references</p> required <code>kwargs</code> <p>dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.</p> <code>{}</code> <p>Returns:</p> Type Description <code>IsolatedEnergyInterface</code> <p>Initialized IsolatedEnergyInterface-like object</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def dispatch_factory(data: Any, **kwargs: Dict) -&gt; \"IsolatedEnergyInterface\":\n    \"\"\"\n    Factory function that select the correct\n    energy class for the fetching/calculation\n    of isolated atom energies.\n\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n\n    Returns:\n        Initialized IsolatedEnergyInterface-like object\n    \"\"\"\n    if data.energy_type == \"formation\":\n        return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"regression\":\n        try:\n            return RegressionEnergy(data, **kwargs)\n        except np.linalg.LinAlgError:\n            logger.warning(\"Error! 
Using physical energies instead.\")\n            return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"null\":\n        return NullEnergy(data, **kwargs)\n</code></pre>"},{"location":"API/formats.html","title":"Format loading","text":""},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure","title":"<code>GeneralStructure</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract Factory class for datasets type in the openQDC package.</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>class GeneralStructure(ABC):\n    \"\"\"\n    Abstract Factory class for datasets type in the openQDC package.\n    \"\"\"\n\n    _ext: Optional[str] = None\n    _extra_files: Optional[List[str]] = None\n\n    @property\n    def ext(self):\n        return self._ext\n\n    @property\n    @abstractmethod\n    def load_fn(self) -&gt; Callable:\n        \"\"\"\n        Function to use for loading the data.\n        Must be implemented by the child class.\n\n        Returns:\n            the function to use for loading the data\n        \"\"\"\n        raise NotImplementedError\n\n    def add_extension(self, filename: str) -&gt; str:\n        \"\"\"\n        Add the correct extension to a filename\n\n        Parameters:\n            filename:  the filename to add the extension to\n\n        Returns:\n            the filename with the extension\n        \"\"\"\n        return filename + self.ext\n\n    @abstractmethod\n    def save_preprocess(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_dict: Dict[str, np.ndarray],\n        extra_data_keys: List[str],\n        extra_data_types: Dict[str, type],\n    ) -&gt; List[str]:\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n        Must be implemented by the child class.\n\n        Parameters:\n            preprocess_path:  path to the 
preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_dict:        dictionary of data to save\n            extra_data_keys:  list of keys to load from the extra data file\n            extra_data_types: dictionary of data types for each key\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def load_extra_files(\n        self,\n        data: Dict[str, np.ndarray],\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        pkl_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Load extra files required to define other types of data.\n        Must be implemented by the child class.\n\n        Parameters:\n            data:  dictionary of data to load\n            preprocess_path:  path to the preprocessed data file\n            data_keys:    list of keys to load from the data file\n            pkl_data_keys:   list of keys to load from the extra files\n            overwrite:   whether to overwrite the local cache\n        \"\"\"\n        raise NotImplementedError\n\n    def join_and_ext(self, path: Union[str, PathLike], filename: str) -&gt; Union[str, PathLike]:\n        \"\"\"\n        Join a path and a filename and add the correct extension.\n\n        Parameters:\n            path:  the path to join\n            filename:  the filename to join\n\n        Returns:\n            the joined path with the correct extension\n        \"\"\"\n        return p_join(path, self.add_extension(filename))\n\n    def load_data(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_types: Dict[str, np.dtype],\n        data_shapes: Dict[str, Tuple[int, int]],\n        extra_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Main method to load the data from a filetype structure like memmap or zarr.\n\n        Parameters:\n            preprocess_path:  path 
to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_types:       dictionary of data types for each key\n            data_shapes:      dictionary of shapes for each key\n            extra_data_keys:  list of keys to load from the extra data file\n            overwrite:        whether to overwrite the local cache\n        \"\"\"\n        data = {}\n        for key in data_keys:\n            filename = self.join_and_ext(preprocess_path, key)\n            pull_locally(filename, overwrite=overwrite)\n            data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n            data[key] = self.unpack(data[key])\n            data[key] = data[key].reshape(*data_shapes[key])\n\n        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n        return data\n\n    def unpack(self, data: any) -&gt; any:\n        \"\"\"\n        Unpack the data from the loaded file.\n\n        Parameters:\n            data:  the data to unpack\n\n        Returns:\n            the unpacked data\n        \"\"\"\n        return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_fn","title":"<code>load_fn: Callable</code>  <code>abstractmethod</code> <code>property</code>","text":"<p>Function to use for loading the data. 
Must be implemented by the child class.</p> <p>Returns:</p> Type Description <code>Callable</code> <p>the function to use for loading the data</p>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.add_extension","title":"<code>add_extension(filename)</code>","text":"<p>Add the correct extension to a filename</p> <p>Parameters:</p> Name Type Description Default <code>filename</code> <code>str</code> <p>the filename to add the extension to</p> required <p>Returns:</p> Type Description <code>str</code> <p>the filename with the extension</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def add_extension(self, filename: str) -&gt; str:\n    \"\"\"\n    Add the correct extension to a filename\n\n    Parameters:\n        filename:  the filename to add the extension to\n\n    Returns:\n        the filename with the extension\n    \"\"\"\n    return filename + self.ext\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.join_and_ext","title":"<code>join_and_ext(path, filename)</code>","text":"<p>Join a path and a filename and add the correct extension.</p> <p>Parameters:</p> Name Type Description Default <code>path</code> <code>Union[str, PathLike]</code> <p>the path to join</p> required <code>filename</code> <code>str</code> <p>the filename to join</p> required <p>Returns:</p> Type Description <code>Union[str, PathLike]</code> <p>the joined path with the correct extension</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def join_and_ext(self, path: Union[str, PathLike], filename: str) -&gt; Union[str, PathLike]:\n    \"\"\"\n    Join a path and a filename and add the correct extension.\n\n    Parameters:\n        path:  the path to join\n        filename:  the filename to join\n\n    Returns:\n        the joined path with the correct extension\n    \"\"\"\n    return p_join(path, 
self.add_extension(filename))\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_data","title":"<code>load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)</code>","text":"<p>Main method to load the data from a filetype structure like memmap or zarr.</p> <p>Parameters:</p> Name Type Description Default <code>preprocess_path</code> <code>Union[str, PathLike]</code> <p>path to the preprocessed data file</p> required <code>data_keys</code> <code>List[str]</code> <p>list of keys to load from the data file</p> required <code>data_types</code> <code>Dict[str, dtype]</code> <p>dictionary of data types for each key</p> required <code>data_shapes</code> <code>Dict[str, Tuple[int, int]]</code> <p>dictionary of shapes for each key</p> required <code>extra_data_keys</code> <code>List[str]</code> <p>list of keys to load from the extra data file</p> required <code>overwrite</code> <code>bool</code> <p>whether to overwrite the local cache</p> required Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def load_data(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_types: Dict[str, np.dtype],\n    data_shapes: Dict[str, Tuple[int, int]],\n    extra_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Main method to load the data from a filetype structure like memmap or zarr.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_types:       dictionary of data types for each key\n        data_shapes:      dictionary of shapes for each key\n        extra_data_keys:  list of keys to load from the extra data file\n        overwrite:        whether to overwrite the local cache\n    \"\"\"\n    data = {}\n    for key in data_keys:\n        filename = self.join_and_ext(preprocess_path, key)\n        pull_locally(filename, 
overwrite=overwrite)\n        data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n        data[key] = self.unpack(data[key])\n        data[key] = data[key].reshape(*data_shapes[key])\n\n    data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n    return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_extra_files","title":"<code>load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite)</code>  <code>abstractmethod</code>","text":"<p>Load extra files required to define other types of data. Must be implemented by the child class.</p> <p>Parameters:</p> Name Type Description Default <code>data</code> <code>Dict[str, ndarray]</code> <p>dictionary of data to load</p> required <code>preprocess_path</code> <code>Union[str, PathLike]</code> <p>path to the preprocessed data file</p> required <code>data_keys</code> <code>List[str]</code> <p>list of keys to load from the data file</p> required <code>pkl_data_keys</code> <code>List[str]</code> <p>list of keys to load from the extra files</p> required <code>overwrite</code> <code>bool</code> <p>whether to overwrite the local cache</p> required Source code in <code>openqdc/datasets/structure.py</code> <pre><code>@abstractmethod\ndef load_extra_files(\n    self,\n    data: Dict[str, np.ndarray],\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    pkl_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Load extra files required to define other types of data.\n    Must be implemented by the child class.\n\n    Parameters:\n        data:  dictionary of data to load\n        preprocess_path:  path to the preprocessed data file\n        data_keys:    list of keys to load from the data file\n        pkl_data_keys:   list of keys to load from the extra files\n        overwrite:   whether to overwrite the local cache\n    \"\"\"\n    raise 
NotImplementedError\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.save_preprocess","title":"<code>save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types)</code>  <code>abstractmethod</code>","text":"<p>Save the preprocessed data to the cache directory and optionally upload it to the remote storage. Must be implemented by the child class.</p> <p>Parameters:</p> Name Type Description Default <code>preprocess_path</code> <code>Union[str, PathLike]</code> <p>path to the preprocessed data file</p> required <code>data_keys</code> <code>List[str]</code> <p>list of keys to load from the data file</p> required <code>data_dict</code> <code>Dict[str, ndarray]</code> <p>dictionary of data to save</p> required <code>extra_data_keys</code> <code>List[str]</code> <p>list of keys to load from the extra data file</p> required <code>extra_data_types</code> <code>Dict[str, type]</code> <p>dictionary of data types for each key</p> required Source code in <code>openqdc/datasets/structure.py</code> <pre><code>@abstractmethod\ndef save_preprocess(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_dict: Dict[str, np.ndarray],\n    extra_data_keys: List[str],\n    extra_data_types: Dict[str, type],\n) -&gt; List[str]:\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n    Must be implemented by the child class.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_dict:        dictionary of data to save\n        extra_data_keys:  list of keys to load from the extra data file\n        extra_data_types: dictionary of data types for each key\n    \"\"\"\n    raise 
NotImplementedError\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.unpack","title":"<code>unpack(data)</code>","text":"<p>Unpack the data from the loaded file.</p> <p>Parameters:</p> Name Type Description Default <code>data</code> <code>any</code> <p>the data to unpack</p> required <p>Returns:</p> Type Description <code>any</code> <p>the unpacked data</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def unpack(self, data: any) -&gt; any:\n    \"\"\"\n    Unpack the data from the loaded file.\n\n    Parameters:\n        data:  the data to unpack\n\n    Returns:\n        the unpacked data\n    \"\"\"\n    return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.MemMapDataset","title":"<code>MemMapDataset</code>","text":"<p>               Bases: <code>GeneralStructure</code></p> <p>Dataset structure for memory-mapped numpy arrays and props.pkl files.</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>class MemMapDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for memory-mapped numpy arrays and props.pkl files.\n    \"\"\"\n\n    _ext = \".mmap\"\n    _extra_files = [\"props.pkl\"]\n\n    @property\n    def load_fn(self):\n        return np.memmap\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -&gt; List[str]:\n        local_paths = []\n        for key in data_keys:\n            local_path = self.join_and_ext(preprocess_path, key)\n            out = np.memmap(local_path, mode=\"w+\", dtype=data_dict[key].dtype, shape=data_dict[key].shape)\n            out[:] = data_dict.pop(key)[:]\n            out.flush()\n            local_paths.append(local_path)\n\n        # save smiles and subset\n        local_path = p_join(preprocess_path, \"props.pkl\")\n\n        # assert that (required) pkl keys are present in data_dict\n        assert all([key in data_dict.keys() for key in 
extra_data_keys])\n\n        # store unique and inverse indices for str-based pkl keys\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        with open(local_path, \"wb\") as f:\n            pkl.dump(data_dict, f)\n\n        local_paths.append(local_path)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = p_join(preprocess_path, \"props.pkl\")\n        pull_locally(filename, overwrite=overwrite)\n        with open(filename, \"rb\") as f:\n            tmp = pkl.load(f)\n            all_pkl_keys = set(tmp.keys()) - set(data_keys)\n            # assert required pkl_keys are present in all_pkl_keys\n            assert all([key in all_pkl_keys for key in pkl_data_keys])\n            for key in all_pkl_keys:\n                x = tmp.pop(key)\n                if len(x) == 2:\n                    data[key] = x[0][x[1]]\n                else:\n                    data[key] = x\n        return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.ZarrDataset","title":"<code>ZarrDataset</code>","text":"<p>               Bases: <code>GeneralStructure</code></p> <p>Dataset structure for zarr files.</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>class ZarrDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for zarr files.\n    \"\"\"\n\n    _ext = \".zip\"\n    _extra_files = [\"metadata.zip\"]\n    _zarr_version = 2\n\n    @property\n    def load_fn(self):\n        return zarr.open\n\n    def unpack(self, data):\n        return data[:]\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -&gt; List[str]:\n        # os.makedirs(p_join(ds.root, \"zips\",  ds.__name__), exist_ok=True)\n        local_paths = []\n        for key, value in data_dict.items():\n    
        if key not in data_keys:\n                continue\n            zarr_path = self.join_and_ext(preprocess_path, key)\n            value = data_dict.pop(key)\n            z = zarr.open(\n                zarr.storage.ZipStore(zarr_path),\n                \"w\",\n                zarr_version=self._zarr_version,\n                shape=value.shape,\n                dtype=value.dtype,\n            )\n            z[:] = value[:]\n            local_paths.append(zarr_path)\n            # if key in attrs:\n            #    z.attrs.update(attrs[key])\n\n        metadata = p_join(preprocess_path, \"metadata.zip\")\n\n        group = zarr.group(zarr.storage.ZipStore(metadata))\n\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        for key, value in data_dict.items():\n            # sub=group.create_group(key)\n            if key in [\"name\", \"subset\"]:\n                data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)\n                data[:] = value[0][:]\n                data2 = group.create_dataset(key + \"_ptr\", shape=value[1].shape, dtype=np.int32)\n                data2[:] = value[1][:]\n            else:\n                data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)\n                data[:] = value[:]\n        local_paths.append(metadata)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = self.join_and_ext(preprocess_path, \"metadata\")\n        pull_locally(filename, overwrite=overwrite)\n        tmp = self.load_fn(filename)\n        all_pkl_keys = set(tmp.keys()) - set(data_keys)\n        # assert required pkl_keys are present in all_pkl_keys\n        assert all([key in all_pkl_keys for key in pkl_data_keys])\n        for key in all_pkl_keys:\n            if key not in pkl_data_keys:\n             
   data[key] = tmp[key][:][tmp[key][:]]\n            else:\n                data[key] = tmp[key][:]\n        return data\n</code></pre>"},{"location":"API/methods.html","title":"QM Methods","text":""},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod","title":"<code>InteractionMethod</code>","text":"<p>               Bases: <code>QmMethod</code></p> Source code in <code>openqdc/methods/enums.py</code> <pre><code>class InteractionMethod(QmMethod):\n    CCSD_T_NN = Functional.CCSDT, BasisSet.NN\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    DCCSDT_HA_DZ = Functional.DCCSDT, BasisSet.HA_DZ\n    DCCSDT_HA_TZ = Functional.DCCSDT, BasisSet.HA_TZ\n    DLPNO_CCSDT = Functional.DLPNO_CCSDT, BasisSet.NONE\n    DLPNO_CCSDT0 = (\n        Functional.DLPNO_CCSDT0,\n        BasisSet.NONE,\n    )\n    FN_DMC = Functional.FN_DMC, BasisSet.NONE\n    FIXED = Functional.FIXED, BasisSet.NONE\n    LNO_CCSDT = Functional.LNO_CCSDT, BasisSet.NONE\n    MP2_CBS = Functional.MP2, BasisSet.CBS\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MP2_5_CBS_ADZ = Functional.MP2_5, BasisSet.CBS_ADZ\n    MP2C_CBS = Functional.MP2C, BasisSet.CBS\n    QCISDT_CBS = Functional.QCISDT, BasisSet.CBS\n    SAPT0_AUG_CC_PWCVXZ = Functional.SAPT0, BasisSet.AUG_CC_PWCVXZ\n    SAPT0_JUN_CC_PVDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDZ\n    SAPT0_JUN_CC_PVDDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDDZ\n    SAPT0_AUG_CC_PVDDZ = Functional.SAPT0, BasisSet.AUG_CC_PVDDZ\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get an empty atomization energy dictionary because Interaction methods don't require this\"\"\"\n        return {}\n</code></pre>"},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod.atom_energies_dict","title":"<code>atom_energies_dict</code>  
<code>property</code>","text":"<p>Get an empty atomization energy dictionary because Interaction methods don't require this</p>"},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod","title":"<code>PotentialMethod</code>","text":"<p>               Bases: <code>QmMethod</code></p> Source code in <code>openqdc/methods/enums.py</code> <pre><code>class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1\n    B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP\n    B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ\n    B1LYP_VWN5_TZP = Functional.B1LYP_VWN5, BasisSet.TZP\n    B1PW91_VWN5_DZP = Functional.B1PW91_VWN5, BasisSet.DZP\n    B1PW91_VWN5_SZ = Functional.B1PW91_VWN5, BasisSet.SZ\n    B1PW91_VWN5_TZP = Functional.B1PW91_VWN5, BasisSet.TZP\n    B3LYP_STO3G = Functional.B3LYP, BasisSet.STO3G  # TODO: calculate e0s\n    B3LYP_VWN5_DZP = Functional.B3LYP_VWN5, BasisSet.DZP\n    B3LYP_VWN5_SZ = Functional.B3LYP_VWN5, BasisSet.SZ\n    B3LYP_VWN5_TZP = Functional.B3LYP_VWN5, BasisSet.TZP\n    B3LYP_S_VWN5_DZP = Functional.B3LYP_S_VWN5, BasisSet.DZP\n    B3LYP_S_VWN5_SZ = Functional.B3LYP_S_VWN5, BasisSet.SZ\n    B3LYP_S_VWN5_TZP = Functional.B3LYP_S_VWN5, BasisSet.TZP\n    B3LYP_D_DZP = Functional.B3LYPD, BasisSet.DZP\n    B3LYP_D_SZ = Functional.B3LYPD, BasisSet.SZ\n    B3LYP_D_TZP = Functional.B3LYPD, BasisSet.TZP\n    B3LYP_D3_BJ_DEF2_TZVP = Functional.B3LYP_D3_BJ, BasisSet.DEF2_TZVP\n    B3LYP_6_31G_D = Functional.B3LYP, BasisSet.GSTAR\n    B3LYP_DEF2_TZVP = Functional.B3LYP, BasisSet.DEF2_TZVP\n    B97_1_DZP = Functional.B97_1, BasisSet.DZP\n    B97_1_SZ = Functional.B97_1, BasisSet.SZ\n    B97_1_TZP = Functional.B97_1, BasisSet.TZP\n    B97_2_DZP = Functional.B97_2, BasisSet.DZP\n    B97_2_SZ = Functional.B97_2, BasisSet.SZ\n    B97_2_TZP = Functional.B97_2, BasisSet.TZP\n    B97_D_DZP = Functional.B97_D, BasisSet.DZP\n    B97_D_SZ = Functional.B97_D, BasisSet.SZ\n    B97_D_TZP = Functional.B97_D, BasisSet.TZP\n    B97_DZP = 
Functional.B97, BasisSet.DZP\n    B97_SZ = Functional.B97, BasisSet.SZ\n    B97_TZP = Functional.B97, BasisSet.TZP\n    BECKE00_X_ONLY_DZP = Functional.BECKE00_X_ONLY, BasisSet.DZP\n    BECKE00_X_ONLY_SZ = Functional.BECKE00_X_ONLY, BasisSet.SZ\n    BECKE00_X_ONLY_TZP = Functional.BECKE00_X_ONLY, BasisSet.TZP\n    BECKE00_DZP = Functional.BECKE00, BasisSet.DZP\n    BECKE00_SZ = Functional.BECKE00, BasisSet.SZ\n    BECKE00_TZP = Functional.BECKE00, BasisSet.TZP\n    BECKE00X_XC_DZP = Functional.BECKE00X_XC, BasisSet.DZP\n    BECKE00X_XC_SZ = Functional.BECKE00X_XC, BasisSet.SZ\n    BECKE00X_XC_TZP = Functional.BECKE00X_XC, BasisSet.TZP\n    BECKE88X_BR89C_DZP = Functional.BECKE88X_BR89C, BasisSet.DZP\n    BECKE88X_BR89C_SZ = Functional.BECKE88X_BR89C, BasisSet.SZ\n    BECKE88X_BR89C_TZP = Functional.BECKE88X_BR89C, BasisSet.TZP\n    BHANDH_DZP = Functional.BHANDH, BasisSet.DZP\n    BHANDH_SZ = Functional.BHANDH, BasisSet.SZ\n    BHANDH_TZP = Functional.BHANDH, BasisSet.TZP\n    BHANDHLYP_DZP = Functional.BHANDHLYP, BasisSet.DZP\n    BHANDHLYP_SZ = Functional.BHANDHLYP, BasisSet.SZ\n    BHANDHLYP_TZP = Functional.BHANDHLYP, BasisSet.TZP\n    BLAP3_DZP = Functional.BLAP3, BasisSet.DZP\n    BLAP3_SZ = Functional.BLAP3, BasisSet.SZ\n    BLAP3_TZP = Functional.BLAP3, BasisSet.TZP\n    BLYP_D_DZP = Functional.BLYPD, BasisSet.DZP\n    BLYP_D_SZ = Functional.BLYPD, BasisSet.SZ\n    BLYP_D_TZP = Functional.BLYPD, BasisSet.TZP\n    BLYP_DZP = Functional.BLYP, BasisSet.DZP\n    BLYP_SZ = Functional.BLYP, BasisSet.SZ\n    BLYP_TZP = Functional.BLYP, BasisSet.TZP\n    BMTAU1_DZP = Functional.BMTAU1, BasisSet.DZP\n    BMTAU1_SZ = Functional.BMTAU1, BasisSet.SZ\n    BMTAU1_TZP = Functional.BMTAU1, BasisSet.TZP\n    BOP_DZP = Functional.BOP, BasisSet.DZP\n    BOP_SZ = Functional.BOP, BasisSet.SZ\n    BOP_TZP = Functional.BOP, BasisSet.TZP\n    BP_DZP = Functional.BP, BasisSet.DZP\n    BP_SZ = Functional.BP, BasisSet.SZ\n    BP_TZP = Functional.BP, BasisSet.TZP\n    BP86_D_DZP = 
Functional.BP86_D, BasisSet.DZP\n    BP86_D_SZ = Functional.BP86_D, BasisSet.SZ\n    BP86_D_TZP = Functional.BP86_D, BasisSet.TZP\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ\n\n    DFT3B = Functional.DFT3B, BasisSet.NONE\n    DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP\n    FT97_DZP = Functional.FT97, BasisSet.DZP\n    FT97_SZ = Functional.FT97, BasisSet.SZ\n    FT97_TZP = Functional.FT97, BasisSet.TZP\n    GFN1_XTB = Functional.GFN1_XTB, BasisSet.NONE\n    GFN2_XTB = Functional.GFN2_XTB, BasisSet.NONE\n    HCTH_120_DZP = Functional.HCTH_120, BasisSet.DZP\n    HCTH_120_SZ = Functional.HCTH_120, BasisSet.SZ\n    HCTH_120_TZP = Functional.HCTH_120, BasisSet.TZP\n    HCTH_147_DZP = Functional.HCTH_147, BasisSet.DZP\n    HCTH_147_SZ = Functional.HCTH_147, BasisSet.SZ\n    HCTH_147_TZP = Functional.HCTH_147, BasisSet.TZP\n    HCTH_407_DZP = Functional.HCTH_407, BasisSet.DZP\n    HCTH_407_SZ = Functional.HCTH_407, BasisSet.SZ\n    HCTH_407_TZP = Functional.HCTH_407, BasisSet.TZP\n    HCTH_93_DZP = Functional.HCTH_93, BasisSet.DZP\n    HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ\n    HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP\n    HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP\n    HF_CC_PVDZ = (\n        Functional.HF,\n        BasisSet.CC_PVDZ,\n    )\n    HF_CC_PVQZ = (\n        Functional.HF,\n        BasisSet.CC_PVQZ,\n    )\n    HF_CC_PVTZ = (\n        Functional.HF,\n        BasisSet.CC_PVTZ,\n    )\n    KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP\n    KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ\n    KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP\n    KCIS_ORIGINAL_DZP = Functional.KCIS_ORIGINAL, BasisSet.DZP\n    KCIS_ORIGINAL_SZ = Functional.KCIS_ORIGINAL, BasisSet.SZ\n    KCIS_ORIGINAL_TZP = Functional.KCIS_ORIGINAL, 
BasisSet.TZP\n    KMLYP_VWN5_DZP = Functional.KMLYP_VWN5, BasisSet.DZP\n    KMLYP_VWN5_SZ = Functional.KMLYP_VWN5, BasisSet.SZ\n    KMLYP_VWN5_TZP = Functional.KMLYP_VWN5, BasisSet.TZP\n    KT1_DZP = Functional.KT1, BasisSet.DZP\n    KT1_SZ = Functional.KT1, BasisSet.SZ\n    KT1_TZP = Functional.KT1, BasisSet.TZP\n    KT2_DZP = Functional.KT2, BasisSet.DZP\n    KT2_SZ = Functional.KT2, BasisSet.SZ\n    KT2_TZP = Functional.KT2, BasisSet.TZP\n    LDA_VWN_DZP = Functional.LDA_VWN, BasisSet.DZP\n    LDA_VWN_SZ = Functional.LDA_VWN, BasisSet.SZ\n    LDA_VWN_TZP = Functional.LDA_VWN, BasisSet.TZP\n    M05_2X_DZP = Functional.M05_2X, BasisSet.DZP\n    M05_2X_SZ = Functional.M05_2X, BasisSet.SZ\n    M05_2X_TZP = Functional.M05_2X, BasisSet.TZP\n    M05_DZP = Functional.M05, BasisSet.DZP\n    M05_SZ = Functional.M05, BasisSet.SZ\n    M05_TZP = Functional.M05, BasisSet.TZP\n    M06_2X_DZP = Functional.M06_2X, BasisSet.DZP\n    M06_2X_SZ = Functional.M06_2X, BasisSet.SZ\n    M06_2X_TZP = Functional.M06_2X, BasisSet.TZP\n    M06_L_DZP = Functional.M06_L, BasisSet.DZP\n    M06_L_SZ = Functional.M06_L, BasisSet.SZ\n    M06_L_TZP = Functional.M06_L, BasisSet.TZP\n    M06_DZP = Functional.M06, BasisSet.DZP\n    M06_SZ = Functional.M06, BasisSet.SZ\n    M06_TZP = Functional.M06, BasisSet.TZP\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MPBE_DZP = Functional.MPBE, BasisSet.DZP\n    MPBE_SZ = Functional.MPBE, BasisSet.SZ\n    MPBE_TZP = Functional.MPBE, BasisSet.TZP\n    MPBE0KCIS_DZP = Functional.MPBE0KCIS, BasisSet.DZP\n    MPBE0KCIS_SZ = Functional.MPBE0KCIS, BasisSet.SZ\n    MPBE0KCIS_TZP = Functional.MPBE0KCIS, BasisSet.TZP\n    MPBE1KCIS_DZP = Functional.MPBE1KCIS, BasisSet.DZP\n    MPBE1KCIS_SZ = Functional.MPBE1KCIS, BasisSet.SZ\n    MPBE1KCIS_TZP = Functional.MPBE1KCIS, BasisSet.TZP\n    MPBEKCIS_DZP = Functional.MPBEKCIS, BasisSet.DZP\n    MPBEKCIS_SZ = 
Functional.MPBEKCIS, BasisSet.SZ\n    MPBEKCIS_TZP = Functional.MPBEKCIS, BasisSet.TZP\n    MPW_DZP = Functional.MPW, BasisSet.DZP\n    MPW_SZ = Functional.MPW, BasisSet.SZ\n    MPW_TZP = Functional.MPW, BasisSet.TZP\n    MPW1K_DZP = Functional.MPW1K, BasisSet.DZP\n    MPW1K_SZ = Functional.MPW1K, BasisSet.SZ\n    MPW1K_TZP = Functional.MPW1K, BasisSet.TZP\n    MPW1PW_DZP = Functional.MPW1PW, BasisSet.DZP\n    MPW1PW_SZ = Functional.MPW1PW, BasisSet.SZ\n    MPW1PW_TZP = Functional.MPW1PW, BasisSet.TZP\n    MVS_DZP = Functional.MVS, BasisSet.DZP\n    MVS_SZ = Functional.MVS, BasisSet.SZ\n    MVS_TZP = Functional.MVS, BasisSet.TZP\n    MVSX_DZP = Functional.MVSX, BasisSet.DZP\n    MVSX_SZ = Functional.MVSX, BasisSet.SZ\n    MVSX_TZP = Functional.MVSX, BasisSet.TZP\n    O3LYP_VWN5_DZP = Functional.O3LYP_VWN5, BasisSet.DZP\n    O3LYP_VWN5_SZ = Functional.O3LYP_VWN5, BasisSet.SZ\n    O3LYP_VWN5_TZP = Functional.O3LYP_VWN5, BasisSet.TZP\n    OLAP3_DZP = Functional.OLAP3, BasisSet.DZP\n    OLAP3_SZ = Functional.OLAP3, BasisSet.SZ\n    OLAP3_TZP = Functional.OLAP3, BasisSet.TZP\n    OLYP_DZP = Functional.OLYP, BasisSet.DZP\n    OLYP_SZ = Functional.OLYP, BasisSet.SZ\n    OLYP_TZP = Functional.OLYP, BasisSet.TZP\n    OPBE_DZP = Functional.OPBE, BasisSet.DZP\n    OPBE_SZ = Functional.OPBE, BasisSet.SZ\n    OPBE_TZP = Functional.OPBE, BasisSet.TZP\n    OPBE0_DZP = Functional.OPBE0, BasisSet.DZP\n    OPBE0_SZ = Functional.OPBE0, BasisSet.SZ\n    OPBE0_TZP = Functional.OPBE0, BasisSet.TZP\n    OPERDEW_DZP = Functional.OPERDEW, BasisSet.DZP\n    OPERDEW_SZ = Functional.OPERDEW, BasisSet.SZ\n    OPERDEW_TZP = Functional.OPERDEW, BasisSet.TZP\n    PBE_D_DZP = Functional.PBE_D, BasisSet.DZP\n    PBE_D_SZ = Functional.PBE_D, BasisSet.SZ\n    PBE_D_TZP = Functional.PBE_D, BasisSet.TZP\n    PBE_D3_BJ_DEF2_TZVP = Functional.PBE_D3_BJ, BasisSet.DEF2_TZVP\n    PBE_DEF2_TZVP = Functional.PBE, BasisSet.DEF2_TZVP\n    PBE_DZP = Functional.PBE, BasisSet.DZP\n    PBE_SZ = Functional.PBE, 
BasisSet.SZ\n    PBE_TZP = Functional.PBE, BasisSet.TZP\n    PBE0_DZP = Functional.PBE0, BasisSet.DZP\n    PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP\n    PBE0_SZ = Functional.PBE0, BasisSet.SZ\n    PBE0_TZP = Functional.PBE0, BasisSet.TZP\n    PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD\n    PBESOL_DZP = Functional.PBESOL, BasisSet.DZP\n    PBESOL_SZ = Functional.PBESOL, BasisSet.SZ\n    PBESOL_TZP = Functional.PBESOL, BasisSet.TZP\n    PKZB_DZP = Functional.PKZB, BasisSet.DZP\n    PKZB_SZ = Functional.PKZB, BasisSet.SZ\n    PKZB_TZP = Functional.PKZB, BasisSet.TZP\n    PKZBX_KCISCOR_DZP = Functional.PKZBX_KCISCOR, BasisSet.DZP\n    PKZBX_KCISCOR_SZ = Functional.PKZBX_KCISCOR, BasisSet.SZ\n    PKZBX_KCISCOR_TZP = Functional.PKZBX_KCISCOR, BasisSet.TZP\n    PM6 = Functional.PM6, BasisSet.NONE\n    PW91_DZP = Functional.PW91, BasisSet.DZP\n    PW91_SZ = Functional.PW91, BasisSet.SZ\n    PW91_TZP = Functional.PW91, BasisSet.TZP\n    REVPBE_D3_BJ_DEF2_TZVP = Functional.REVPBE_D3_BJ, BasisSet.DEF2_TZVP\n    REVPBE_DZP = Functional.REVPBE, BasisSet.DZP\n    REVPBE_SZ = Functional.REVPBE, BasisSet.SZ\n    REVPBE_TZP = Functional.REVPBE, BasisSet.TZP\n    REVTPSS_DZP = Functional.REVTPSS, BasisSet.DZP\n    REVTPSS_SZ = Functional.REVTPSS, BasisSet.SZ\n    REVTPSS_TZP = Functional.REVTPSS, BasisSet.TZP\n    RGE2_DZP = Functional.RGE2, BasisSet.DZP\n    RGE2_SZ = Functional.RGE2, BasisSet.SZ\n    RGE2_TZP = Functional.RGE2, BasisSet.TZP\n    RPBE_DZP = Functional.RPBE, BasisSet.DZP\n    RPBE_SZ = Functional.RPBE, BasisSet.SZ\n    RPBE_TZP = Functional.RPBE, BasisSet.TZP\n    SSB_D_DZP = Functional.SSB_D, BasisSet.DZP\n    SSB_D_SZ = Functional.SSB_D, BasisSet.SZ\n    SSB_D_TZP = Functional.SSB_D, BasisSet.TZP\n    SVWN_DEF2_TZVP = Functional.SVWN, BasisSet.DEF2_TZVP\n    TMGGA_DZP = Functional.TMGGA, BasisSet.DZP\n    TMGGA_SZ = Functional.TMGGA, BasisSet.SZ\n    TMGGA_TZP = Functional.TMGGA, BasisSet.TZP\n    TAU_HCTH_HYBRID_DZP = 
Functional.TAU_HCTH_HYBRID, BasisSet.DZP\n    TAU_HCTH_HYBRID_SZ = Functional.TAU_HCTH_HYBRID, BasisSet.SZ\n    TAU_HCTH_HYBRID_TZP = Functional.TAU_HCTH_HYBRID, BasisSet.TZP\n    TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP\n    TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ\n    TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP\n    TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ\n    TPSSD_DZP = Functional.TPSSD, BasisSet.DZP\n    TPSSD_SZ = Functional.TPSSD, BasisSet.SZ\n    TPSSD_TZP = Functional.TPSSD, BasisSet.TZP\n    TPSS_DZP = Functional.TPSS, BasisSet.DZP\n    TPSS_SZ = Functional.TPSS, BasisSet.SZ\n    TPSS_TZP = Functional.TPSS, BasisSet.TZP\n    TPSSH_DEF2_TZVP = Functional.TPSSH, BasisSet.DEF2_TZVP\n    TPSSH_DZP = Functional.TPSSH, BasisSet.DZP\n    TPSSH_SZ = Functional.TPSSH, BasisSet.SZ\n    TPSSH_TZP = Functional.TPSSH, BasisSet.TZP\n    TTM2_1_F = Functional.TTM2_1_F, BasisSet.NONE\n    VS98_X_XC_DZP = Functional.VS98_X_XC, BasisSet.DZP\n    VS98_X_XC_SZ = Functional.VS98_X_XC, BasisSet.SZ\n    VS98_X_XC_TZP = Functional.VS98_X_XC, BasisSet.TZP\n    VS98_X_ONLY_DZP = Functional.VS98_X_ONLY, BasisSet.DZP\n    VS98_X_ONLY_SZ = Functional.VS98_X_ONLY, BasisSet.SZ\n    VS98_X_ONLY_TZP = Functional.VS98_X_ONLY, BasisSet.TZP\n    VS98_DZP = Functional.VS98, BasisSet.DZP\n    VS98_SZ = Functional.VS98, BasisSet.SZ\n    VS98_TZP = Functional.VS98, BasisSet.TZP\n    WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD\n    WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP\n    WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP\n    WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ\n    WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR\n    WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ\n    X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, BasisSet.DZP\n    X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ\n    X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP\n    XLYP_DZP = Functional.XLYP, BasisSet.DZP\n 
   XLYP_SZ = Functional.XLYP, BasisSet.SZ\n    XLYP_TZP = Functional.XLYP, BasisSet.TZP\n    NONE = Functional.NONE, BasisSet.NONE\n\n    def _build_default_dict(self):\n        e0_dict = {}\n        for SYMBOL in ATOM_SYMBOLS:\n            for CHARGE in range(-10, 11):\n                e0_dict[(SYMBOL, CHARGE)] = array([0], dtype=float32)\n        return e0_dict\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        key = str(self)\n        try:\n            # print(key)\n            energies = atom_energy_collection.get(key, {})\n            if len(energies) == 0:\n                raise\n        except:  # noqa\n            logger.info(f\"No available atomization energy for the QM method {key}. All values are set to 0.\")\n            energies = self._build_default_dict()\n        return energies\n</code></pre>"},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod.atom_energies_dict","title":"<code>atom_energies_dict</code>  <code>property</code>","text":"<p>Get the atomization energy dictionary</p>"},{"location":"API/methods.html#openqdc.methods.enums.QmMethod","title":"<code>QmMethod</code>","text":"<p>               Bases: <code>Enum</code></p> Source code in <code>openqdc/methods/enums.py</code> <pre><code>class QmMethod(Enum):\n    def __init__(self, functional: Functional, basis_set: BasisSet, cost: float = 0):\n        self.functional = functional\n        self.basis_set = basis_set\n        self.cost = cost\n\n    def __str__(self):\n        if self.basis_set != \"\":\n            s = \"/\".join([str(self.functional), str(self.basis_set)])\n        else:\n            s = str(self.functional)\n        return s\n\n    @property\n    def atom_energies_matrix(self):\n        \"\"\"Get the atomization energy matrix\"\"\"\n        energies = self.atom_energies_dict\n        mat = to_e_matrix(energies)\n\n        return mat\n\n    @property\n    def atom_energies_dict(self):\n       
 \"\"\"Get the atomization energy dictionary\"\"\"\n        raise NotImplementedError()\n</code></pre>"},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_dict","title":"<code>atom_energies_dict</code>  <code>property</code>","text":"<p>Get the atomization energy dictionary</p>"},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_matrix","title":"<code>atom_energies_matrix</code>  <code>property</code>","text":"<p>Get the atomization energy matrix</p>"},{"location":"API/methods.html#isolated-atom-energies","title":"Isolated Atom Energies","text":""},{"location":"API/methods.html#openqdc.methods.atom_energies.to_e_matrix","title":"<code>to_e_matrix(atom_energies)</code>","text":"<p>Get the matrix of isolated atom energies for a dict of non-null values calculates</p> <p>Parameters:</p> Name Type Description Default <code>atom_energies</code> <code>Dict</code> <p>Dict of energies computed for a given QM method. Keys are pairs of (atom, charge) and values are energy values</p> required <p>np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)</p> Type Description <code>ndarray</code> <p>Matrix containing the isolated atom energies for each atom and charge written in the form:</p> <pre><code>        |   | -2 | -1 | 0 | +1 | +2 | &lt;- charges\n        |---|----|----|---|----|----|\n        | 0 |    |    |   |    |    |\n        | 1 |    |    |   |    |    |\n        | 2 |    |    |   |    |    |\n</code></pre> Source code in <code>openqdc/methods/atom_energies.py</code> <pre><code>def to_e_matrix(atom_energies: Dict) -&gt; np.ndarray:\n    \"\"\"\n    Get the matrix of isolated atom energies for a dict of non-null values calculates\n\n    Parameters:\n        atom_energies: Dict of energies computed for a given QM method.\n            Keys are pairs of (atom, charge) and values are energy values\n\n    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)\n        Matrix containing the isolated atom 
energies for each atom and charge written in the form:\n\n                        |   | -2 | -1 | 0 | +1 | +2 | &lt;- charges\n                        |---|----|----|---|----|----|\n                        | 0 |    |    |   |    |    |\n                        | 1 |    |    |   |    |    |\n                        | 2 |    |    |   |    |    |\n    \"\"\"\n\n    matrix = np.zeros((MAX_ATOMIC_NUMBER, MAX_CHARGE_NUMBER))\n    if len(atom_energies) &gt; 0:\n        for key in atom_energies.keys():\n            try:\n                matrix[ATOMIC_NUMBERS[key[0]], key[1] + MAX_CHARGE] = atom_energies[key]\n            except KeyError:\n                logger.error(f\"Isolated atom energies not found for {key}\")\n    return matrix\n</code></pre>"},{"location":"API/properties.html","title":"Defined properties for datasets","text":""},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn","title":"<code>DatasetPropertyMixIn</code>","text":"<p>Mixin class for BaseDataset class to add properties that are common to all datasets.</p> Source code in <code>openqdc/datasets/properties.py</code> <pre><code>class DatasetPropertyMixIn:\n    \"\"\"\n    Mixin class for BaseDataset class to add\n    properties that are common to all datasets.\n    \"\"\"\n\n    @property\n    def atoms_per_molecules(self):\n        try:\n            if hasattr(self, \"_n_atoms\"):\n                return self._n_atoms\n            self._n_atoms = self.data[\"n_atoms\"]\n            return self._n_atoms\n        except:  # noqa\n            return None\n\n    @property\n    def _stats(self):\n        return self.__stats__\n\n    def _compute_average_nb_atoms(self):\n        self.__average_nb_atoms__ = np.mean(self.data[\"n_atoms\"])\n\n    @property\n    def average_n_atoms(self) -&gt; int:\n        \"\"\"\n        Average number of atoms in a molecule in the dataset.\n\n        Returns:\n            Average number of atoms in a molecule in the dataset.\n        \"\"\"\n    
    if self.__average_nb_atoms__ is None:\n            raise StatisticsNotAvailableError(self.__name__)\n        return self.__average_nb_atoms__\n\n    @property\n    def numbers(self) -&gt; np.ndarray:\n        \"\"\"\n        Unique atomic numbers in the dataset\n\n        Returns:\n            Array of the unique atomic numbers in the dataset\n        \"\"\"\n        if hasattr(self, \"_numbers\"):\n            return self._numbers\n        self._numbers = pd.unique(self.data[\"atomic_inputs\"][..., 0]).astype(np.int32)\n        return self._numbers\n\n    @property\n    def charges(self) -&gt; np.ndarray:\n        \"\"\"\n        Unique charges in the dataset\n\n        Returns:\n            Array of the unique charges in the dataset\n        \"\"\"\n        if hasattr(self, \"_charges\"):\n            return self._charges\n        self._charges = np.unique(self.data[\"atomic_inputs\"][..., :2], axis=0).astype(np.int32)\n        return self._charges\n\n    @property\n    def min_max_charges(self) -&gt; Tuple[int, int]:\n        \"\"\"\n        Minimum and maximum charges in the dataset\n\n        Returns:\n            (min_charge, max_charge)\n        \"\"\"\n        if hasattr(self, \"_min_max_charges\"):\n            return self._min_max_charges\n        self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])\n        return self._min_max_charges\n\n    @property\n    def chemical_species(self) -&gt; np.ndarray:\n        \"\"\"\n        Chemical symbols in the dataset\n\n        Returns:\n            Array of the chemical symbols in the dataset\n        \"\"\"\n        return np.array(ATOM_SYMBOLS)[self.numbers]\n</code></pre>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.average_n_atoms","title":"<code>average_n_atoms: int</code>  <code>property</code>","text":"<p>Average number of atoms in a molecule in the dataset.</p> <p>Returns:</p> Type Description <code>int</code> <p>Average number of atoms 
in a molecule in the dataset.</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.charges","title":"<code>charges: np.ndarray</code>  <code>property</code>","text":"<p>Unique charges in the dataset</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Array of the unique charges in the dataset</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.chemical_species","title":"<code>chemical_species: np.ndarray</code>  <code>property</code>","text":"<p>Chemical symbols in the dataset</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Array of the chemical symbols in the dataset</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.min_max_charges","title":"<code>min_max_charges: Tuple[int, int]</code>  <code>property</code>","text":"<p>Minimum and maximum charges in the dataset</p> <p>Returns:</p> Type Description <code>Tuple[int, int]</code> <p>(min_charge, max_charge)</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.numbers","title":"<code>numbers: np.ndarray</code>  <code>property</code>","text":"<p>Unique atomic numbers in the dataset</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Array of the unique atomic numbers in the dataset</p>"},{"location":"API/regressor.html","title":"Normalization regressor","text":"<p>Linear Atom Energies regression utilities.</p>"},{"location":"API/regressor.html#openqdc.utils.regressor.LinearSolver","title":"<code>LinearSolver</code>","text":"<p>               Bases: <code>Solver</code></p> <p>Linear regression solver.</p> Note <p>No Uncertainty associated as it is quite small.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class LinearSolver(Solver):\n    \"\"\"\n    Linear regression solver.\n\n    Note:\n        No Uncertainty associated as it is quite small.\n    \"\"\"\n\n    _regr_str = \"linear\"\n\n    @staticmethod\n    def solve(X, 
y):\n        X, y, y_mean = atom_standardization(X, y)\n        E0s = np.linalg.lstsq(X, y, rcond=None)[0]\n        return E0s, None\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor","title":"<code>Regressor</code>","text":"<p>Regressor class for preparing and solving regression problem for isolated atom energies. An isolated atom energy regression problem is defined as:</p> <p>X = [n_samples, n_species] (number of atoms of each species per sample)</p> <p>Y = [n_samples, ] (energies)</p> <p>The regression problem is solved by solving the linear system X E0 = Y.</p> Example <p>For a system of 2 samples (H2O, CH4)</p> <pre><code>n_species = 3, n_samples = 2\n\nH2O = 2H, 1O -&gt; X = [2, 1, 0]\n\nCH4 = 4H, 1C -&gt; X = [4, 0, 1]\n\nX = [[2, 1, 0],\n    [ 4, 0, 1]]\n\nY = [[10, 20]]\n\nX E0 = Y\n</code></pre> <p>Linear system to solve</p> <pre><code>[[2 eH, 1 eO, 0 eC],\n[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]\n</code></pre> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class Regressor:\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n    A isolated atom energy regression problem is defined as:\\n\n    X = [n_samples, n_species] (number of atoms of each species per sample)\\n\n    Y = [n_samples, ] (energies)\\n\n    The regression problem is solved by solving the linear system X E0 = Y.\n\n    Example:\n        For a sytem of 2 samples (H20, CH4)\\n\n            n_species = 3, n_samples = 2\\n\n            H20 = 2H , 1O -&gt; X = [2, 1, 0]\\n\n            CH4 = 4C, 1H -&gt; X = [1, 0, 4]\\n\n            X = [[2, 1, 0],\n                [ 1, 0, 4]]\\n\n            Y = [[10, 20]]\\n\n            X E0 = Y\\n\n        Linear system to solve\\n\n            [[2 eH, 1 eO, 0 eC],\n            [ 1 eH, 0 eO, 4 eC]] = [[10, 20]]\n    \"\"\"\n\n    solver: Solver\n\n    def __init__(\n        self,\n        energies: np.ndarray,\n        atomic_numbers: np.ndarray,\n        
position_idx_range: np.ndarray,\n        solver_type: str = \"linear\",\n        stride: int = 1,\n        subsample: Optional[Union[float, int]] = None,\n        remove_nan: bool = True,\n        *args: any,\n        **kwargs: any,\n    ):\n        \"\"\"\n        Regressor class for preparing and solving regression problem for isolated atom energies.\n\n        Parameters:\n            energies:\n                numpy array of energies in the shape (n_samples, n_energy_methods)\n            atomic_numbers:\n                numpy array of atomic numbers in the shape (n_atoms,)\n            position_idx_range:\n                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n            solver_type: Type of solver to use. [\"linear\", \"ridge\"]\n            stride: Stride to use for the regression.\n            subsample: Sumsample the dataset.\n                If a float, it is interpreted as a fraction of the dataset to use.\n                If &gt;1 it is interpreted as the number of samples to use.\n            remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n        \"\"\"\n        self.subsample = subsample\n        self.stride = stride\n        self.solver_type = solver_type.lower()\n        self.energies = energies\n        self.atomic_numbers = atomic_numbers\n        self.numbers = pd.unique(atomic_numbers)\n        self.position_idx_range = position_idx_range\n        self.remove_nan = remove_nan\n        self.hparams = {\n            \"subsample\": subsample,\n            \"stride\": stride,\n            \"solver_type\": solver_type,\n        }\n        self._post_init()\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -&gt; \"Regressor\":\n        \"\"\"\n        Initialize 
the regressor object from an openqdc dataset. This is the default method.\n        *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n        Parameters:\n            dataset: openqdc dataset object.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n\n        Returns:\n            Instance of the regressor class.\n        \"\"\"\n        energies = dataset.data[\"energies\"]\n        position_idx_range = dataset.data[\"position_idx_range\"]\n        atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n        return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n\n    def _post_init(self):\n        if self.subsample is not None:\n            self._downsample()\n        self._prepare_inputs()\n        self.solver = self._get_solver()\n\n    def update_hparams(self, hparams):\n        self.hparams.update(hparams)\n\n    def _downsample(self):\n        if self.subsample &lt; 1:\n            idxs = np.arange(self.energies.shape[0])\n            np.random.shuffle(idxs)\n            idxs = idxs[: int(self.energies.shape[0] * self.subsample)]\n            self.energies = self.energies[:: int(1 / self.subsample)]\n            self.position_idx_range = self.position_idx_range[:: int(1 / self.subsample)]\n        else:\n            idxs = np.random.randint(0, self.energies.shape[0], int(self.subsample))\n            self.energies = self.energies[idxs]\n            self.position_idx_range = self.position_idx_range[idxs]\n        self.update_hparams({\"idxs\": idxs})\n\n    def _get_solver(self):\n        try:\n            return AVAILABLE_SOLVERS[self.solver_type]()\n        except KeyError:\n            logger.warning(f\"Unknown solver type {self.solver_type}, defaulting to linear regression.\")\n            return LinearSolver()\n\n    def _prepare_inputs(self) -&gt; 
Tuple[np.ndarray, np.ndarray]:\n        logger.info(\"Preparing inputs for regression.\")\n        len_train = self.energies.shape[0]\n        len_zs = len(self.numbers)\n        A = np.zeros((len_train, len_zs))[:: self.stride]\n        B = self.energies[:: self.stride]\n        for i, ij in enumerate(self.position_idx_range[:: self.stride]):\n            tmp = self.atomic_numbers[ij[0] : ij[1]]\n            for j, z in enumerate(self.numbers):\n                A[i, j] = np.count_nonzero(tmp == z)\n        self.X = A\n        self.y = B\n\n    def solve(self):\n        \"\"\"\n        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n        \"\"\"\n        logger.info(f\"Solving regression with {self.solver}.\")\n        E0_list, cov_list = [], []\n        for energy_idx in range(self.y.shape[1]):\n            if self.remove_nan:\n                idxs = non_nan_idxs(self.y[:, energy_idx])\n                X, y = self.X[idxs], self.y[idxs, energy_idx]\n            else:\n                X, y = self.X, self.y[:, energy_idx]\n            E0s, cov = self.solver(X, y)\n            if cov is None:\n                cov = np.zeros_like(E0s) + 1.0\n            E0_list.append(E0s)\n            cov_list.append(cov)\n        return np.vstack(E0_list).T, np.vstack(cov_list).T\n\n    def __call__(self):\n        return self.solve()\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.__init__","title":"<code>__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)</code>","text":"<p>Regressor class for preparing and solving regression problem for isolated atom energies.</p> <p>Parameters:</p> Name Type Description Default <code>energies</code> <code>ndarray</code> <p>numpy array of energies in the shape (n_samples, n_energy_methods)</p> required <code>atomic_numbers</code> <code>ndarray</code> <p>numpy array of 
atomic numbers in the shape (n_atoms,)</p> required <code>position_idx_range</code> <code>ndarray</code> <p>array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset</p> required <code>solver_type</code> <code>str</code> <p>Type of solver to use. [\"linear\", \"ridge\"]</p> <code>'linear'</code> <code>stride</code> <code>int</code> <p>Stride to use for the regression.</p> <code>1</code> <code>subsample</code> <code>Optional[Union[float, int]]</code> <p>Sumsample the dataset. If a float, it is interpreted as a fraction of the dataset to use. If &gt;1 it is interpreted as the number of samples to use.</p> <code>None</code> <code>remove_nan</code> <code>bool</code> <p>Sanitize the dataset by removing energies samples with NaN values.</p> <code>True</code> <code>*args</code> <code>any</code> <p>Additional arguments to be passed to the regressor.</p> <code>()</code> <code>**kwargs</code> <code>any</code> <p>Additional keyword arguments to be passed to the regressor.</p> <code>{}</code> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def __init__(\n    self,\n    energies: np.ndarray,\n    atomic_numbers: np.ndarray,\n    position_idx_range: np.ndarray,\n    solver_type: str = \"linear\",\n    stride: int = 1,\n    subsample: Optional[Union[float, int]] = None,\n    remove_nan: bool = True,\n    *args: any,\n    **kwargs: any,\n):\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n\n    Parameters:\n        energies:\n            numpy array of energies in the shape (n_samples, n_energy_methods)\n        atomic_numbers:\n            numpy array of atomic numbers in the shape (n_atoms,)\n        position_idx_range:\n            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n        solver_type: Type of solver to use. 
[\"linear\", \"ridge\"]\n        stride: Stride to use for the regression.\n        subsample: Sumsample the dataset.\n            If a float, it is interpreted as a fraction of the dataset to use.\n            If &gt;1 it is interpreted as the number of samples to use.\n        remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n    \"\"\"\n    self.subsample = subsample\n    self.stride = stride\n    self.solver_type = solver_type.lower()\n    self.energies = energies\n    self.atomic_numbers = atomic_numbers\n    self.numbers = pd.unique(atomic_numbers)\n    self.position_idx_range = position_idx_range\n    self.remove_nan = remove_nan\n    self.hparams = {\n        \"subsample\": subsample,\n        \"stride\": stride,\n        \"solver_type\": solver_type,\n    }\n    self._post_init()\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.from_openqdc_dataset","title":"<code>from_openqdc_dataset(dataset, *args, **kwargs)</code>  <code>classmethod</code>","text":"<p>Initialize the regressor object from an openqdc dataset. This is the default method. 
args and and *kwargs are passed to the init method and depends on the specific regressor.</p> <p>Parameters:</p> Name Type Description Default <code>dataset</code> <code>any</code> <p>openqdc dataset object.</p> required <code>*args</code> <code>any</code> <p>Additional arguments to be passed to the regressor.</p> <code>()</code> <code>**kwargs</code> <code>any</code> <p>Additional keyword arguments to be passed to the regressor.</p> <code>{}</code> <p>Returns:</p> Type Description <code>Regressor</code> <p>Instance of the regressor class.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>@classmethod\ndef from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -&gt; \"Regressor\":\n    \"\"\"\n    Initialize the regressor object from an openqdc dataset. This is the default method.\n    *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n    Parameters:\n        dataset: openqdc dataset object.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n\n    Returns:\n        Instance of the regressor class.\n    \"\"\"\n    energies = dataset.data[\"energies\"]\n    position_idx_range = dataset.data[\"position_idx_range\"]\n    atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n    return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.solve","title":"<code>solve()</code>","text":"<p>Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def solve(self):\n    \"\"\"\n    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n    \"\"\"\n    logger.info(f\"Solving regression with {self.solver}.\")\n    E0_list, 
cov_list = [], []\n    for energy_idx in range(self.y.shape[1]):\n        if self.remove_nan:\n            idxs = non_nan_idxs(self.y[:, energy_idx])\n            X, y = self.X[idxs], self.y[idxs, energy_idx]\n        else:\n            X, y = self.X, self.y[:, energy_idx]\n        E0s, cov = self.solver(X, y)\n        if cov is None:\n            cov = np.zeros_like(E0s) + 1.0\n        E0_list.append(E0s)\n        cov_list.append(cov)\n    return np.vstack(E0_list).T, np.vstack(cov_list).T\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.RidgeSolver","title":"<code>RidgeSolver</code>","text":"<p>               Bases: <code>Solver</code></p> <p>Ridge regression solver.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class RidgeSolver(Solver):\n    \"\"\"\n    Ridge regression solver.\n    \"\"\"\n\n    _regr_str = \"ridge\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        A = X.T @ X\n        dy = y - (np.sum(X, axis=1, keepdims=True) * y_mean).reshape(y.shape)\n        Xy = X.T @ dy\n        mean = np.linalg.solve(A, Xy)\n        sigma2 = np.var(X @ mean - dy)\n        Ainv = np.linalg.inv(A)\n        cov = np.sqrt(sigma2 * np.einsum(\"ij,kj,kl,li-&gt;i\", Ainv, X, X, Ainv))\n        mean = mean + y_mean.reshape([-1])\n        return mean, cov\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Solver","title":"<code>Solver</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract class for regression solvers.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class Solver(ABC):\n    \"\"\"Abstract class for regression solvers.\"\"\"\n\n    _regr_str: str\n\n    @staticmethod\n    @abstractmethod\n    def solve(X: np.ndarray, Y: np.ndarray) -&gt; Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Main method to solve the regression problem.\n        Must be implemented in all the subclasses.\n\n  
      Parameters:\n            X: Input features of shape (n_samples, n_species)\n            Y: Target values of shape (n_samples,) (energy values for the regression)\n\n        Returns:\n            Tuple of predicted values and the estimated uncertainty.\n        \"\"\"\n        pass\n\n    def __call__(self, X, Y):\n        return self.solve(X, Y)\n\n    def __str__(self):\n        return self._regr_str\n\n    def __repr__(self):\n        return str(self)\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Solver.solve","title":"<code>solve(X, Y)</code>  <code>abstractmethod</code> <code>staticmethod</code>","text":"<p>Main method to solve the regression problem. Must be implemented in all the subclasses.</p> <p>Parameters:</p> Name Type Description Default <code>X</code> <code>ndarray</code> <p>Input features of shape (n_samples, n_species)</p> required <code>Y</code> <code>ndarray</code> <p>Target values of shape (n_samples,) (energy values for the regression)</p> required <p>Returns:</p> Type Description <code>Tuple[ndarray, Optional[ndarray]]</code> <p>Tuple of predicted values and the estimated uncertainty.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>@staticmethod\n@abstractmethod\ndef solve(X: np.ndarray, Y: np.ndarray) -&gt; Tuple[np.ndarray, Optional[np.ndarray]]:\n    \"\"\"\n    Main method to solve the regression problem.\n    Must be implemented in all the subclasses.\n\n    Parameters:\n        X: Input features of shape (n_samples, n_species)\n        Y: Target values of shape (n_samples,) (energy values for the regression)\n\n    Returns:\n        Tuple of predicted values and the estimated uncertainty.\n    \"\"\"\n    pass\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.atom_standardization","title":"<code>atom_standardization(X, y)</code>","text":"<p>Standardize the energies and the atom counts. 
This will make the calculated uncertainty more meaningful.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def atom_standardization(X, y):\n    \"\"\"\n    Standardize the energies and the atom counts.\n    This will make the calculated uncertainty more\n    meaningful.\n    \"\"\"\n    X_norm = X.sum()\n    X = X / X_norm\n    y = y / X_norm\n    y_mean = y.sum() / X.sum()\n    return X, y, y_mean\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.non_nan_idxs","title":"<code>non_nan_idxs(array)</code>","text":"<p>Return non nan indices of an array.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def non_nan_idxs(array):\n    \"\"\"\n    Return non nan indices of an array.\n    \"\"\"\n    return np.where(~np.isnan(array))[0]\n</code></pre>"},{"location":"API/statistics.html","title":"Statistics","text":""},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator","title":"<code>AbstractStatsCalculator</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract class that defines the interface for all the calculators object and the methods to compute the statistics.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class AbstractStatsCalculator(ABC):\n    \"\"\"\n    Abstract class that defines the interface for all\n    the calculators object and the methods to\n    compute the statistics.\n    \"\"\"\n\n    # State Dependencies of the calculator to skip part of the calculation\n    state_dependency = []\n    name = None\n\n    def __init__(\n        self,\n        name: str,\n        energy_type: Optional[str] = None,\n        force_recompute: bool = False,\n        energies: Optional[np.ndarray] = None,\n        n_atoms: Optional[np.ndarray] = None,\n        atom_species: Optional[np.ndarray] = None,\n        position_idx_range: Optional[np.ndarray] = None,\n        e0_matrix: Optional[np.ndarray] = None,\n        atom_charges: 
Optional[np.ndarray] = None,\n        forces: Optional[np.ndarray] = None,\n    ):\n        \"\"\"\n        Parameters:\n            name :\n                Name of the dataset for saving and loading.\n            energy_type :\n                Type of the energy for the computation of the statistics. Used for loading and saving.\n            force_recompute :\n                Flag to force the recomputation of the statistics\n            energies : n\n                Energies of the dataset\n            n_atoms :\n                Number of atoms in the dataset\n            atom_species :\n                Atomic species of the dataset\n            position_idx_range : n\n                Position index range of the dataset\n            e0_matrix :\n                Isolated atom energies matrix of the dataset\n            atom_charges :\n                Atomic charges of the dataset\n            forces :\n                Forces of the dataset\n        \"\"\"\n        self.name = name\n        self.energy_type = energy_type\n        self.force_recompute = force_recompute\n        self.energies = energies\n        self.forces = forces\n        self.position_idx_range = position_idx_range\n        self.e0_matrix = e0_matrix\n        self.n_atoms = n_atoms\n        self.atom_species_charges_tuple = (atom_species, atom_charges)\n        self._root = p_join(get_local_cache(), self.name)\n        if atom_species is not None and atom_charges is not None:\n            # by value not reference\n            self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n\n    @property\n    def has_forces(self) -&gt; bool:\n        return self.forces is not None\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"statistics\", self.name + f\"_{str(self)}\" + \".pkl\")\n        return path\n\n    @property\n    def root(self):\n        \"\"\"\n        Path to the dataset folder\n        \"\"\"\n        
return self._root\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset, recompute: bool = False):\n        \"\"\"\n        Create a calculator object from a dataset object.\n        \"\"\"\n        obj = cls(\n            name=dataset.__name__,\n            force_recompute=recompute,\n            energy_type=dataset.energy_type,\n            energies=dataset.data[\"energies\"],\n            forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n            n_atoms=dataset.data[\"n_atoms\"],\n            position_idx_range=dataset.data[\"position_idx_range\"],\n            atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n            atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n            e0_matrix=dataset.__isolated_atom_energies__,\n        )\n        obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n        return obj\n\n    @abstractmethod\n    def compute(self) -&gt; StatisticsResults:\n        \"\"\"\n        Abstract method to compute the statistics.\n        Must return a StatisticsResults object and be implemented\n        in all the childs\n        \"\"\"\n        raise NotImplementedError\n\n    def save_statistics(self) -&gt; None:\n        \"\"\"\n        Save statistics file to the dataset folder as a pkl file\n        \"\"\"\n        save_pkl(self.result, self.preprocess_path)\n\n    def attempt_load(self) -&gt; bool:\n        \"\"\"\n        Load precomputed statistics file and return the success of the operation\n        \"\"\"\n        try:\n            self.result = load_pkl(self.preprocess_path)\n            logger.info(f\"Statistics for {str(self)} loaded successfully\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n            return False\n\n    def _setup_deps(self, state: Dict) -&gt; None:\n        \"\"\"\n        Check if the dependencies of calculators are satisfied\n        from the state object and set the attributes of the calculator\n        to skip part of the calculation\n        \"\"\"\n        self.state = state\n        self.deps_satisfied = all([dep in state for dep in self.state_dependency])\n        if self.deps_satisfied:\n            for dep in self.state_dependency:\n                setattr(self, dep, state[dep])\n\n    def write_state(self, update: Dict) -&gt; None:\n        \"\"\"\n        Write/update the state dictionary with the update dictionary\n\n        update:\n            dictionary containing the update to the state\n        \"\"\"\n        self.state.update(update)\n\n    def run(self, state: Dict) -&gt; None:\n        \"\"\"\n        Main method to run the calculator.\n        Setup the dependencies from the state dictionary\n        Check if the statistics are already computed and load them or\n        recompute them\n        Save the statistics in the correct folder\n\n        state:\n            dictionary containing the state of the calculator\n        \"\"\"\n        self._setup_deps(state)\n        if self.force_recompute or not self.attempt_load():\n            self.result = self.compute()\n            self.save_statistics()\n\n    def __str__(self) -&gt; str:\n        return self.__class__.__name__.lower()\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.root","title":"<code>root</code>  <code>property</code>","text":"<p>Path to the dataset folder</p>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.__init__","title":"<code>__init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, 
forces=None)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>name</code> <p>Name of the dataset for saving and loading.</p> required <code>energy_type</code> <p>Type of the energy for the computation of the statistics. Used for loading and saving.</p> <code>None</code> <code>force_recompute</code> <p>Flag to force the recomputation of the statistics</p> <code>False</code> <code>energies</code> <p>n Energies of the dataset</p> <code>None</code> <code>n_atoms</code> <p>Number of atoms in the dataset</p> <code>None</code> <code>atom_species</code> <p>Atomic species of the dataset</p> <code>None</code> <code>position_idx_range</code> <p>n Position index range of the dataset</p> <code>None</code> <code>e0_matrix</code> <p>Isolated atom energies matrix of the dataset</p> <code>None</code> <code>atom_charges</code> <p>Atomic charges of the dataset</p> <code>None</code> <code>forces</code> <p>Forces of the dataset</p> <code>None</code> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def __init__(\n    self,\n    name: str,\n    energy_type: Optional[str] = None,\n    force_recompute: bool = False,\n    energies: Optional[np.ndarray] = None,\n    n_atoms: Optional[np.ndarray] = None,\n    atom_species: Optional[np.ndarray] = None,\n    position_idx_range: Optional[np.ndarray] = None,\n    e0_matrix: Optional[np.ndarray] = None,\n    atom_charges: Optional[np.ndarray] = None,\n    forces: Optional[np.ndarray] = None,\n):\n    \"\"\"\n    Parameters:\n        name :\n            Name of the dataset for saving and loading.\n        energy_type :\n            Type of the energy for the computation of the statistics. 
Used for loading and saving.\n        force_recompute :\n            Flag to force the recomputation of the statistics\n        energies : n\n            Energies of the dataset\n        n_atoms :\n            Number of atoms in the dataset\n        atom_species :\n            Atomic species of the dataset\n        position_idx_range : n\n            Position index range of the dataset\n        e0_matrix :\n            Isolated atom energies matrix of the dataset\n        atom_charges :\n            Atomic charges of the dataset\n        forces :\n            Forces of the dataset\n    \"\"\"\n    self.name = name\n    self.energy_type = energy_type\n    self.force_recompute = force_recompute\n    self.energies = energies\n    self.forces = forces\n    self.position_idx_range = position_idx_range\n    self.e0_matrix = e0_matrix\n    self.n_atoms = n_atoms\n    self.atom_species_charges_tuple = (atom_species, atom_charges)\n    self._root = p_join(get_local_cache(), self.name)\n    if atom_species is not None and atom_charges is not None:\n        # by value not reference\n        self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.attempt_load","title":"<code>attempt_load()</code>","text":"<p>Load precomputed statistics file and return the success of the operation</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def attempt_load(self) -&gt; bool:\n    \"\"\"\n    Load precomputed statistics file and return the success of the operation\n    \"\"\"\n    try:\n        self.result = load_pkl(self.preprocess_path)\n        logger.info(f\"Statistics for {str(self)} loaded successfully\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n        return False\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.compute","title":"<code>compute()</code>  <code>abstractmethod</code>","text":"<p>Abstract method to compute the statistics. Must return a StatisticsResults object and be implemented in all the childs</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@abstractmethod\ndef compute(self) -&gt; StatisticsResults:\n    \"\"\"\n    Abstract method to compute the statistics.\n    Must return a StatisticsResults object and be implemented\n    in all the childs\n    \"\"\"\n    raise NotImplementedError\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.from_openqdc_dataset","title":"<code>from_openqdc_dataset(dataset, recompute=False)</code>  <code>classmethod</code>","text":"<p>Create a calculator object from a dataset object.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@classmethod\ndef from_openqdc_dataset(cls, dataset, recompute: bool = False):\n    \"\"\"\n    Create a calculator object from a dataset object.\n    \"\"\"\n    obj = cls(\n        name=dataset.__name__,\n        force_recompute=recompute,\n        energy_type=dataset.energy_type,\n        energies=dataset.data[\"energies\"],\n        forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n        n_atoms=dataset.data[\"n_atoms\"],\n        position_idx_range=dataset.data[\"position_idx_range\"],\n        atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n        atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n        e0_matrix=dataset.__isolated_atom_energies__,\n    )\n    obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n    return obj\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.run","title":"<code>run(state)</code>","text":"<p>Main 
method to run the calculator. Setup the dependencies from the state dictionary Check if the statistics are already computed and load them or recompute them Save the statistics in the correct folder</p> state <p>dictionary containing the state of the calculator</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def run(self, state: Dict) -&gt; None:\n    \"\"\"\n    Main method to run the calculator.\n    Setup the dependencies from the state dictionary\n    Check if the statistics are already computed and load them or\n    recompute them\n    Save the statistics in the correct folder\n\n    state:\n        dictionary containing the state of the calculator\n    \"\"\"\n    self._setup_deps(state)\n    if self.force_recompute or not self.attempt_load():\n        self.result = self.compute()\n        self.save_statistics()\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.save_statistics","title":"<code>save_statistics()</code>","text":"<p>Save statistics file to the dataset folder as a pkl file</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def save_statistics(self) -&gt; None:\n    \"\"\"\n    Save statistics file to the dataset folder as a pkl file\n    \"\"\"\n    save_pkl(self.result, self.preprocess_path)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.write_state","title":"<code>write_state(update)</code>","text":"<p>Write/update the state dictionary with the update dictionary</p> update <p>dictionary containing the update to the state</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def write_state(self, update: Dict) -&gt; None:\n    \"\"\"\n    Write/update the state dictionary with the update dictionary\n\n    update:\n        dictionary containing the update to the state\n    \"\"\"\n    
self.state.update(update)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.EnergyStatistics","title":"<code>EnergyStatistics</code>  <code>dataclass</code>","text":"<p>               Bases: <code>StatisticsResults</code></p> <p>Dataclass for energy related statistics</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@dataclass\nclass EnergyStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for energy related statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.ForceStatistics","title":"<code>ForceStatistics</code>  <code>dataclass</code>","text":"<p>               Bases: <code>StatisticsResults</code></p> <p>Dataclass for force statistics</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@dataclass\nclass ForceStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for force statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n    component_mean: Optional[np.ndarray]\n    component_std: Optional[np.ndarray]\n    component_rms: Optional[np.ndarray]\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.ForcesCalculatorStats","title":"<code>ForcesCalculatorStats</code>","text":"<p>               Bases: <code>AbstractStatsCalculator</code></p> <p>Forces statistics calculator class</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class ForcesCalculatorStats(AbstractStatsCalculator):\n    \"\"\"\n    Forces statistics calculator class\n    \"\"\"\n\n    def compute(self) -&gt; ForceStatistics:\n        if not self.has_forces:\n            return ForceStatistics(mean=None, std=None, component_mean=None, component_std=None, component_rms=None)\n        converted_force_data = self.forces\n        num_methods = converted_force_data.shape[2]\n        mean = 
np.nanmean(converted_force_data.reshape(-1, num_methods), axis=0)\n        std = np.nanstd(converted_force_data.reshape(-1, num_methods), axis=0)\n        component_mean = np.nanmean(converted_force_data, axis=0)\n        component_std = np.nanstd(converted_force_data, axis=0)\n        component_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))\n        return ForceStatistics(\n            mean=np.atleast_2d(mean),\n            std=np.atleast_2d(std),\n            component_mean=np.atleast_2d(component_mean),\n            component_std=np.atleast_2d(component_std),\n            component_rms=np.atleast_2d(component_rms),\n        )\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyInterface","title":"<code>FormationEnergyInterface</code>","text":"<p>               Bases: <code>AbstractStatsCalculator</code>, <code>ABC</code></p> <p>Formation Energy interface calculator class. Define the use of the dependency formation_energy in the compute method</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class FormationEnergyInterface(AbstractStatsCalculator, ABC):\n    \"\"\"\n    Formation Energy interface calculator class.\n    Define the use of the dependency formation_energy in the\n    compute method\n    \"\"\"\n\n    state_dependency = [\"formation_energy\"]\n\n    def compute(self) -&gt; EnergyStatistics:\n        # if the state has not the dependency satisfied\n        if not self.deps_satisfied:\n            # run the main computation\n            from openqdc.utils.constants import MAX_CHARGE\n\n            splits_idx = self.position_idx_range[:, 1]\n            s = np.array(self.atom_species_charges_tuple, dtype=int)\n            s[:, 1] += MAX_CHARGE\n            matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.e0_matrix]\n            converted_energy_data = self.energies\n            E = []\n            for i, matrix in enumerate(matrixs):\n                c = 
np.cumsum(np.append([0], matrix))[splits_idx]\n                c[1:] = c[1:] - c[:-1]\n                E.append(converted_energy_data[:, i] - c)\n        else:\n            # if the dependency is satisfied get the dependency\n            E = getattr(self, self.state_dependency[0])\n        self.write_state({self.state_dependency[0]: E})\n        E = np.array(E).T\n        return self._compute(E)\n\n    @abstractmethod\n    def _compute(self, energy) -&gt; EnergyStatistics:\n        raise NotImplementedError\n\n    def __str__(self) -&gt; str:\n        # override the __str__ method to add the energy type to the name\n        # to differentiate between formation and regression type\n        return f\"{self.__class__.__name__.lower()}_{self.energy_type.lower()}\"\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyStats","title":"<code>FormationEnergyStats</code>","text":"<p>               Bases: <code>FormationEnergyInterface</code></p> <p>Formation Energy  calculator class.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class FormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -&gt; EnergyStatistics:\n        formation_E_mean = np.nanmean(energy, axis=0)\n        formation_E_std = np.nanstd(energy, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(formation_E_mean), std=np.atleast_2d(formation_E_std))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.PerAtomFormationEnergyStats","title":"<code>PerAtomFormationEnergyStats</code>","text":"<p>               Bases: <code>FormationEnergyInterface</code></p> <p>Per atom Formation Energy  calculator class.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class PerAtomFormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Per atom Formation Energy  calculator class.\n    \"\"\"\n\n    def 
_compute(self, energy) -&gt; EnergyStatistics:\n        inter_E_mean = np.nanmean((energy / self.n_atoms[:, None]), axis=0)\n        inter_E_std = np.nanstd((energy / self.n_atoms[:, None]), axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(inter_E_mean), std=np.atleast_2d(inter_E_std))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager","title":"<code>StatisticManager</code>","text":"<p>Manager class that automatically handle the shared state between the statistic calculators</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class StatisticManager:\n    \"\"\"\n    Manager class that automatically handle the shared state between\n    the statistic calculators\n    \"\"\"\n\n    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n        \"\"\"\n        Parameters:\n            dataset : openqdc.datasets.base.BaseDataset\n                The dataset object to compute the statistics\n            recompute:\n                Flag to recompute the statistics\n            *statistic_calculators:\n                List of statistic calculators to run\n        \"\"\"\n        self._state = {}\n        self._results = {}\n        self._statistic_calculators = [\n            statistic_calculators.from_openqdc_dataset(dataset, recompute)\n            for statistic_calculators in statistic_calculators\n        ]\n\n    @property\n    def state(self) -&gt; Dict:\n        \"\"\"\n        Return the dictionary state of the manager\n\n        Returns:\n            State of the StatisticManager\n        \"\"\"\n        return self._state\n\n    def reset_state(self):\n        \"\"\"\n        Reset the state dictionary\n        \"\"\"\n        self._state = {}\n\n    def reset_results(self):\n        \"\"\"\n        Reset the results dictionary\n        \"\"\"\n        self._results = {}\n\n    def get_state(self, key: Optional[str] = None) -&gt; 
Optional[Any]:\n        \"\"\"\n        Return the value of the key in the state dictionary\n\n        Parameters:\n            key: str, default = None\n        Returns:\n            the value of the key in the state dictionary\n            or the whole state dictionary if key is None\n        \"\"\"\n        if key is None:\n            return self._state\n        return self._state.get(key, None)\n\n    def has_state(self, key: str) -&gt; bool:\n        \"\"\"\n        Check is state has key\n\n        Parameters:\n            key:\n                Key to check in the state dictionary\n\n        Returns:\n            True if the key is in the state dictionary\n        \"\"\"\n        return key in self._state\n\n    def get_results(self, as_dict: bool = False):\n        \"\"\"\n        Aggregate results from all the calculators\n\n        Parameters:\n            as_dict:\n                Flag to return the results as a dictionary\n        \"\"\"\n        results = deepcopy(self._results)\n        if as_dict:\n            return {k: v.as_dict() for k, v in results.items()}\n        return {k: v for k, v in self._results.items()}\n\n    def run_calculators(self):\n        \"\"\"\n        Run the saved calculators and save the results in the manager\n        \"\"\"\n        logger.info(\"Processing dataset statistics\")\n        for calculator in self._statistic_calculators:\n            calculator.run(self.state)\n            self._results[calculator.__class__.__name__] = calculator.result\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.state","title":"<code>state: Dict</code>  <code>property</code>","text":"<p>Return the dictionary state of the manager</p> <p>Returns:</p> Type Description <code>Dict</code> <p>State of the StatisticManager</p>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.__init__","title":"<code>__init__(dataset, recompute=False, 
*statistic_calculators)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>dataset</code> <p>openqdc.datasets.base.BaseDataset The dataset object to compute the statistics</p> required <code>recompute</code> <code>bool</code> <p>Flag to recompute the statistics</p> <code>False</code> <code>*statistic_calculators</code> <code>AbstractStatsCalculator</code> <p>List of statistic calculators to run</p> <code>()</code> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n    \"\"\"\n    Parameters:\n        dataset : openqdc.datasets.base.BaseDataset\n            The dataset object to compute the statistics\n        recompute:\n            Flag to recompute the statistics\n        *statistic_calculators:\n            List of statistic calculators to run\n    \"\"\"\n    self._state = {}\n    self._results = {}\n    self._statistic_calculators = [\n        statistic_calculators.from_openqdc_dataset(dataset, recompute)\n        for statistic_calculators in statistic_calculators\n    ]\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_results","title":"<code>get_results(as_dict=False)</code>","text":"<p>Aggregate results from all the calculators</p> <p>Parameters:</p> Name Type Description Default <code>as_dict</code> <code>bool</code> <p>Flag to return the results as a dictionary</p> <code>False</code> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def get_results(self, as_dict: bool = False):\n    \"\"\"\n    Aggregate results from all the calculators\n\n    Parameters:\n        as_dict:\n            Flag to return the results as a dictionary\n    \"\"\"\n    results = deepcopy(self._results)\n    if as_dict:\n        return {k: v.as_dict() for k, v in results.items()}\n    return {k: v for k, v in 
self._results.items()}\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_state","title":"<code>get_state(key=None)</code>","text":"<p>Return the value of the key in the state dictionary</p> <p>Parameters:</p> Name Type Description Default <code>key</code> <code>Optional[str]</code> <p>str, default = None</p> <code>None</code> <p>Returns:     the value of the key in the state dictionary     or the whole state dictionary if key is None</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def get_state(self, key: Optional[str] = None) -&gt; Optional[Any]:\n    \"\"\"\n    Return the value of the key in the state dictionary\n\n    Parameters:\n        key: str, default = None\n    Returns:\n        the value of the key in the state dictionary\n        or the whole state dictionary if key is None\n    \"\"\"\n    if key is None:\n        return self._state\n    return self._state.get(key, None)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.has_state","title":"<code>has_state(key)</code>","text":"<p>Check is state has key</p> <p>Parameters:</p> Name Type Description Default <code>key</code> <code>str</code> <p>Key to check in the state dictionary</p> required <p>Returns:</p> Type Description <code>bool</code> <p>True if the key is in the state dictionary</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def has_state(self, key: str) -&gt; bool:\n    \"\"\"\n    Check is state has key\n\n    Parameters:\n        key:\n            Key to check in the state dictionary\n\n    Returns:\n        True if the key is in the state dictionary\n    \"\"\"\n    return key in self._state\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_results","title":"<code>reset_results()</code>","text":"<p>Reset the results dictionary</p> Source code in <code>openqdc/datasets/statistics.py</code> 
<pre><code>def reset_results(self):\n    \"\"\"\n    Reset the results dictionary\n    \"\"\"\n    self._results = {}\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_state","title":"<code>reset_state()</code>","text":"<p>Reset the state dictionary</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def reset_state(self):\n    \"\"\"\n    Reset the state dictionary\n    \"\"\"\n    self._state = {}\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.run_calculators","title":"<code>run_calculators()</code>","text":"<p>Run the saved calculators and save the results in the manager</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def run_calculators(self):\n    \"\"\"\n    Run the saved calculators and save the results in the manager\n    \"\"\"\n    logger.info(\"Processing dataset statistics\")\n    for calculator in self._statistic_calculators:\n        calculator.run(self.state)\n        self._results[calculator.__class__.__name__] = calculator.result\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults","title":"<code>StatisticsResults</code>","text":"<p>Parent class to statistics results to provide general methods.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class StatisticsResults:\n    \"\"\"\n    Parent class to statistics results\n    to provide general methods.\n    \"\"\"\n\n    def to_dict(self) -&gt; Dict:\n        \"\"\"\n        Convert the class to a dictionary\n\n        Returns:\n            Dictionary representation of the class\n        \"\"\"\n        return asdict(self)\n\n    def transform(self, func: Callable):\n        \"\"\"\n        Apply a function to all the attributes of the class\n\n        Parameters:\n            func:\n                Function to apply to the attributes\n        \"\"\"\n        for k, v in 
self.to_dict().items():\n            if v is not None:\n                setattr(self, k, func(v))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.to_dict","title":"<code>to_dict()</code>","text":"<p>Convert the class to a dictionary</p> <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary representation of the class</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def to_dict(self) -&gt; Dict:\n    \"\"\"\n    Convert the class to a dictionary\n\n    Returns:\n        Dictionary representation of the class\n    \"\"\"\n    return asdict(self)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.transform","title":"<code>transform(func)</code>","text":"<p>Apply a function to all the attributes of the class</p> <p>Parameters:</p> Name Type Description Default <code>func</code> <code>Callable</code> <p>Function to apply to the attributes</p> required Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def transform(self, func: Callable):\n    \"\"\"\n    Apply a function to all the attributes of the class\n\n    Parameters:\n        func:\n            Function to apply to the attributes\n    \"\"\"\n    for k, v in self.to_dict().items():\n        if v is not None:\n            setattr(self, k, func(v))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.TotalEnergyStats","title":"<code>TotalEnergyStats</code>","text":"<p>               Bases: <code>AbstractStatsCalculator</code></p> <p>Total Energy statistics calculator class</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class TotalEnergyStats(AbstractStatsCalculator):\n    \"\"\"\n    Total Energy statistics calculator class\n    \"\"\"\n\n    def compute(self) -&gt; EnergyStatistics:\n        converted_energy_data = self.energies\n        total_E_mean = np.nanmean(converted_energy_data, axis=0)\n        total_E_std = 
np.nanstd(converted_energy_data, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(total_E_mean), std=np.atleast_2d(total_E_std))\n</code></pre>"},{"location":"API/units.html","title":"UNITS","text":"<p>Units conversion utilities module.</p> Available Energy units <p>[\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\" \"mev\", \"ryd]</p> Available Distance units <p>[\"ang\", \"nm\", \"bohr\"]</p> Available Force units <p>Combinations between Energy and Distance units</p>"},{"location":"API/units.html#openqdc.utils.units.Conversion","title":"<code>Conversion</code>","text":"<p>Conversion from one unit system to another defined by a name and a callable</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>class Conversion:\n    \"\"\"\n    Conversion from one unit system to another defined by a name and a callable\n    \"\"\"\n\n    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n        \"\"\"\n\n        Parameters:\n            in_unit: String defining the units of the current values\n            out_unit: String defining the target units\n            func: The callable to compute the conversion\n        \"\"\"\n        name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n        if name in CONVERSION_REGISTRY:\n            raise ConversionAlreadyDefined(in_unit, out_unit)\n        CONVERSION_REGISTRY[name] = self\n\n        self.name = name\n        self.fn = func\n\n    def __call__(self, x):\n        return self.fn(x)\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.Conversion.__init__","title":"<code>__init__(in_unit, out_unit, func)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>in_unit</code> <code>str</code> <p>String defining the units of the current values</p> required <code>out_unit</code> <code>str</code> <p>String defining the target units</p> required <code>func</code> <code>Callable[[float], float]</code> <p>The callable to 
compute the conversion</p> required Source code in <code>openqdc/utils/units.py</code> <pre><code>def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n    \"\"\"\n\n    Parameters:\n        in_unit: String defining the units of the current values\n        out_unit: String defining the target units\n        func: The callable to compute the conversion\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n    if name in CONVERSION_REGISTRY:\n        raise ConversionAlreadyDefined(in_unit, out_unit)\n    CONVERSION_REGISTRY[name] = self\n\n    self.name = name\n    self.fn = func\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion","title":"<code>DistanceTypeConversion</code>","text":"<p>               Bases: <code>ConversionEnum</code>, <code>StrEnum</code></p> <p>Define the possible distance units for conversion</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>@unique\nclass DistanceTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible distance units for conversion\n    \"\"\"\n\n    ANG = \"ang\"\n    NM = \"nm\"\n    BOHR = \"bohr\"\n\n    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -&gt; Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the distance to the desired units.\n\n        Parameters:\n            distance: distance unit to convert to\n            fraction: whether it is distance^1 or distance^-1\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion.to","title":"<code>to(distance, fraction=False)</code>","text":"<p>Get the conversion function to convert the distance to the desired 
units.</p> <p>Parameters:</p> Name Type Description Default <code>distance</code> <code>DistanceTypeConversion</code> <p>distance unit to convert to</p> required <code>fraction</code> <code>bool</code> <p>whether it is distance^1 or distance^-1</p> <code>False</code> <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>callable to convert the distance to the desired units</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -&gt; Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the distance to the desired units.\n\n    Parameters:\n        distance: distance unit to convert to\n        fraction: whether it is distance^1 or distance^-1\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion","title":"<code>EnergyTypeConversion</code>","text":"<p>               Bases: <code>ConversionEnum</code>, <code>StrEnum</code></p> <p>Define the possible energy units for conversion</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>@unique\nclass EnergyTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible energy units for conversion\n    \"\"\"\n\n    KCAL_MOL = \"kcal/mol\"\n    KJ_MOL = \"kj/mol\"\n    HARTREE = \"hartree\"\n    EV = \"ev\"\n    MEV = \"mev\"\n    RYD = \"ryd\"\n\n    def to(self, energy: \"EnergyTypeConversion\") -&gt; Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the energy to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n\n        Returns:\n            Callable to convert the distance to the desired units\n        \"\"\"\n        return 
get_conversion(str(self), str(energy))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion.to","title":"<code>to(energy)</code>","text":"<p>Get the conversion function to convert the energy to the desired units.</p> <p>Parameters:</p> Name Type Description Default <code>energy</code> <code>EnergyTypeConversion</code> <p>energy unit to convert to</p> required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>Callable to convert the distance to the desired units</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def to(self, energy: \"EnergyTypeConversion\") -&gt; Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the energy to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n\n    Returns:\n        Callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(energy))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion","title":"<code>ForceTypeConversion</code>","text":"<p>               Bases: <code>ConversionEnum</code></p> <p>Define the possible foce units for conversion</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>@unique\nclass ForceTypeConversion(ConversionEnum):\n    \"\"\"\n    Define the possible foce units for conversion\n    \"\"\"\n\n    #     Name      = EnergyTypeConversion,         , DistanceTypeConversion\n    HARTREE_BOHR = EnergyTypeConversion.HARTREE, DistanceTypeConversion.BOHR\n    HARTREE_ANG = EnergyTypeConversion.HARTREE, DistanceTypeConversion.ANG\n    HARTREE_NM = EnergyTypeConversion.HARTREE, DistanceTypeConversion.NM\n    EV_BOHR = EnergyTypeConversion.EV, DistanceTypeConversion.BOHR\n    EV_ANG = EnergyTypeConversion.EV, DistanceTypeConversion.ANG\n    EV_NM = EnergyTypeConversion.EV, DistanceTypeConversion.NM\n    KCAL_MOL_BOHR = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.BOHR\n    
KCAL_MOL_ANG = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.ANG\n    KCAL_MOL_NM = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.NM\n    KJ_MOL_BOHR = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.BOHR\n    KJ_MOL_ANG = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.ANG\n    KJ_MOL_NM = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.NM\n    MEV_BOHR = EnergyTypeConversion.MEV, DistanceTypeConversion.BOHR\n    MEV_ANG = EnergyTypeConversion.MEV, DistanceTypeConversion.ANG\n    MEV_NM = EnergyTypeConversion.MEV, DistanceTypeConversion.NM\n    RYD_BOHR = EnergyTypeConversion.RYD, DistanceTypeConversion.BOHR\n    RYD_ANG = EnergyTypeConversion.RYD, DistanceTypeConversion.ANG\n    RYD_NM = EnergyTypeConversion.RYD, DistanceTypeConversion.NM\n\n    def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):\n        self.energy = energy\n        self.distance = distance\n\n    def __str__(self):\n        return f\"{self.energy}/{self.distance}\"\n\n    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -&gt; Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the force to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n            distance: distance unit to convert to\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion.to","title":"<code>to(energy, distance)</code>","text":"<p>Get the conversion function to convert the force to the desired units.</p> <p>Parameters:</p> Name Type Description Default <code>energy</code> <code>EnergyTypeConversion</code> <p>energy unit to convert to</p> required <code>distance</code> <code>DistanceTypeConversion</code> <p>distance unit to convert to</p> 
required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>callable to convert the distance to the desired units</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -&gt; Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the force to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n        distance: distance unit to convert to\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.get_conversion","title":"<code>get_conversion(in_unit, out_unit)</code>","text":"<p>Utility function to get the conversion function between two units.</p> <p>Parameters:</p> Name Type Description Default <code>in_unit</code> <p>The input unit</p> required <code>out_unit</code> <p>The output unit</p> required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>The conversion function</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def get_conversion(in_unit: str, out_unit: str) -&gt; Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return 
CONVERSION_REGISTRY[name]\n</code></pre>"},{"location":"API/utils.html","title":"Utils","text":""},{"location":"API/utils.html#openqdc.utils.check_file","title":"<code>check_file(path)</code>","text":"<p>Checks if file present on local</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def check_file(path) -&gt; bool:\n    \"\"\"Checks if file present on local\"\"\"\n    return os.path.exists(path)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.create_hdf5_file","title":"<code>create_hdf5_file(hdf5_file_path)</code>","text":"<p>Creates hdf5 file with fsspec</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def create_hdf5_file(hdf5_file_path: str):\n    \"\"\"Creates hdf5 file with fsspec\"\"\"\n    fp = fsspec.open(hdf5_file_path, \"wb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    return h5py.File(fp, \"a\")\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.get_conversion","title":"<code>get_conversion(in_unit, out_unit)</code>","text":"<p>Utility function to get the conversion function between two units.</p> <p>Parameters:</p> Name Type Description Default <code>in_unit</code> <p>The input unit</p> required <code>out_unit</code> <p>The output unit</p> required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>The conversion function</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def get_conversion(in_unit: str, out_unit: str) -&gt; Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, 
out_unit)\n    return CONVERSION_REGISTRY[name]\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.get_local_cache","title":"<code>get_local_cache()</code>","text":"<p>Returns the local cache directory. It creates it if it does not exist.</p> <p>Returns:</p> Name Type Description <code>str</code> <code>str</code> <p>path to the local cache directory</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def get_local_cache() -&gt; str:\n    \"\"\"\n    Returns the local cache directory. It creates it if it does not exist.\n\n    Returns:\n        str: path to the local cache directory\n    \"\"\"\n    cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR))\n    os.makedirs(cache_dir, exist_ok=True)\n    return cache_dir\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.get_remote_cache","title":"<code>get_remote_cache(write_access=False)</code>","text":"<p>Returns the entry point based on the write access.</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def get_remote_cache(write_access=False) -&gt; str:\n    \"\"\"\n    Returns the entry point based on the write access.\n    \"\"\"\n    if write_access:\n        remote_cache = \"openqdc/v1\"  # \"gs://qmdata-public/openqdc\"\n        # remote_cache = \"gs://qmdata-public/openqdc\"\n    else:\n        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get(\"OPENQDC_DOWNLOAD_API\", \"s3\"))\n        # remote_cache = \"https://storage.googleapis.com/qmdata-public/openqdc\"\n    return remote_cache\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.load_hdf5_file","title":"<code>load_hdf5_file(hdf5_file_path)</code>","text":"<p>Loads hdf5 file with fsspec</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def load_hdf5_file(hdf5_file_path: str):\n    \"\"\"Loads hdf5 file with fsspec\"\"\"\n    if not check_file(hdf5_file_path):\n        raise FileNotFoundError(f\"File {hdf5_file_path} does not exist on GCS and local.\")\n\n    fp = 
fsspec.open(hdf5_file_path, \"rb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    file = h5py.File(fp)\n\n    # inorder to enable multiprocessing:\n    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801\n    # fsspec.asyn.iothread[0] = None\n    # fsspec.asyn.loop[0] = None\n\n    return file\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.load_json","title":"<code>load_json(path)</code>","text":"<p>Loads json file</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def load_json(path):\n    \"\"\"Loads json file\"\"\"\n    with fsspec.open(path, \"r\") as fp:  # Unpickling\n        return json.load(fp)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.load_pkl","title":"<code>load_pkl(path, check=True)</code>","text":"<p>Load pkl file</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def load_pkl(path, check=True):\n    \"\"\"Load pkl file\"\"\"\n    if check:\n        if not check_file(path):\n            raise FileNotFoundError(f\"File {path} does not exist on GCS and local.\")\n\n    with open(path, \"rb\") as fp:  # Unpickling\n        return pkl.load(fp)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.makedirs","title":"<code>makedirs(path, exist_ok=True)</code>","text":"<p>Creates directory</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def makedirs(path, exist_ok=True):\n    \"\"\"Creates directory\"\"\"\n    os.makedirs(path, exist_ok=exist_ok)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.read_qc_archive_h5","title":"<code>read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)</code>","text":"<p>Extracts data from the HDF5 archive file.</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def read_qc_archive_h5(\n    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None\n) -&gt; List[Dict[str, np.ndarray]]:\n    \"\"\"Extracts data from the HDF5 archive 
file.\"\"\"\n    data = load_hdf5_file(raw_path)\n    data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}\n\n    n = len(data_t[\"molecule_id\"])\n    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]\n    return samples\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.save_pkl","title":"<code>save_pkl(file, path)</code>","text":"<p>Saves pkl file</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def save_pkl(file, path):\n    \"\"\"Saves pkl file\"\"\"\n    logger.info(f\"Saving file at {path}\")\n    with fsspec.open(path, \"wb\") as fp:  # Pickling\n        pkl.dump(file, fp)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.set_cache_dir","title":"<code>set_cache_dir(d)</code>","text":"<p>Optionally set the _OPENQDC_CACHE_DIR directory.</p> <p>Parameters:</p> Name Type Description Default <code>d</code> <code>str</code> <p>path to a local folder.</p> required Source code in <code>openqdc/utils/io.py</code> <pre><code>def set_cache_dir(d):\n    r\"\"\"\n    Optionally set the _OPENQDC_CACHE_DIR directory.\n\n    Args:\n        d (str): path to a local folder.\n    \"\"\"\n    if d is None:\n        return\n    global _OPENQDC_CACHE_DIR\n    _OPENQDC_CACHE_DIR = os.path.normpath(os.path.expanduser(d))\n</code></pre>"},{"location":"API/datasets/alchemy.html","title":"Alchemy","text":""},{"location":"API/datasets/alchemy.html#openqdc.datasets.potential.alchemy.Alchemy","title":"<code>Alchemy</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. 
First, OpenBabel is used to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. The auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange matrix.</p> <p>Usage: <pre><code>from openqdc.datasets import Alchemy\ndataset = Alchemy()\n</code></pre></p> Reference <p>https://arxiv.org/abs/1906.09427 https://alchemy.tencent.com/</p> Source code in <code>openqdc/datasets/potential/alchemy.py</code> <pre><code>class Alchemy(BaseDataset):\n    \"\"\"\n    Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.\n    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level\n    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used\n    to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G\n    is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the\n    B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. 
The\n    auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange\n    matrix.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Alchemy\n    dataset = Alchemy()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/1906.09427\n        https://alchemy.tencent.com/\n    \"\"\"\n\n    __name__ = \"alchemy\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\"alchemy.zip\": \"https://alchemy.tencent.com/data/alchemy-v20191129.zip\"}\n\n    def read_raw_entries(self):\n        dir_path = p_join(self.root, \"Alchemy-v20191129\")\n        full_csv = pd.read_csv(p_join(dir_path, \"final_version.csv\"))\n        energies = full_csv[\"U0\\n(Ha, internal energy at 0 K)\"].tolist()\n        atom_folder = full_csv[\"atom number\"]\n        gdb_idx = full_csv[\"gdb_idx\"]\n        idxs = full_csv.index.tolist()\n        samples = []\n        for i in tqdm(idxs):\n            sdf_file = p_join(dir_path, f\"atom_{atom_folder[i]}\", f\"{gdb_idx[i]}.sdf\")\n            energy = energies[i]\n            samples.append(read_mol(sdf_file, energy))\n        return samples\n</code></pre>"},{"location":"API/datasets/ani.html","title":"ANI","text":""},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1","title":"<code>ANI1</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. 
For generating structures, smiles strings for molecules are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT level.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1\ndataset = ANI1()\n</code></pre></p> References <p>https://www.nature.com/articles/sdata2017193</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1(BaseDataset):\n    \"\"\"\n    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic\n    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the\n    wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules\n    are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary\n    point using the MMFF94 force field. 
Finally, geometries are optimized until energy minima using the chosen DFT\n    level.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1\n    dataset = ANI1()\n    ```\n\n    References:\n        https://www.nature.com/articles/sdata2017193\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"ani1.hdf5.gz\": \"https://zenodo.org/record/3585840/files/214.hdf5.gz\"}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"ani\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"ani\", links=self.__links__)\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names)\n        return samples\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX","title":"<code>ANI1CCX</code>","text":"<p>               Bases: <code>ANI1</code></p> <p>ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active learning. 
The conformations are labelled using a high accuracy CCSD(T)*/CBS method.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1CCX\ndataset = ANI1CCX()\n</code></pre></p> References <p>https://doi.org/10.1038/s41467-019-10827-4</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1CCX(ANI1):\n    \"\"\"\n    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active\n    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX\n    dataset = ANI1CCX()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_T_CBS,  # \"ccsd(t)/cbs\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVTZ,  # \"ccsd(t)/cc-pvtz\",\n        PotentialMethod.TCSSD_T_CC_PVDZ,  # \"tccsd(t)/cc-pvdz\",\n    ]\n\n    energy_target_names = [\n        \"CCSD(T)*:CBS Total Energy\",\n        \"NPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n        \"NPNO-CCSD(T):cc-pVTZ Correlation Energy\",\n        \"TPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n    ]\n    force_target_names = []\n    __links__ = {\"ani1x.hdf5.gz\": \"https://zenodo.org/record/4081694/files/292.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        return x.decode(\"ascii\")\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX_V2","title":"<code>ANI1CCX_V2</code>","text":"<p>               Bases: <code>ANI1CCX</code></p> <p>ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each 
conformation.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1CCX_V2\ndataset = ANI1CCX_V2()\n</code></pre></p> References <p>https://doi.org/10.1038/s41467-019-10827-4</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1CCX_V2(ANI1CCX):\n    \"\"\"\n    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels\n    for each conformation.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX_V2\n    dataset = ANI1CCX_V2()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx_v2\"\n\n    __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]\n    energy_target_names = ANI1CCX.energy_target_names + [\"PM6\", \"GFN2\"]\n    __force_mask__ = ANI1CCX.__force_mask__ + [False, False]\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1X","title":"<code>ANI1X</code>","text":"<p>               Bases: <code>ANI1</code></p> <p>The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, generated amino acids and 2-amino acid peptides are used for sampling new molecules. 
One of the techniques are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1X\ndataset = ANI1X()\n</code></pre></p> References <p>https://doi.org/10.1063/1.5023802</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1X(ANI1):\n    \"\"\"\n    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to\n    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,\n    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the techniques\n    are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and\n    (4) torsion sampling.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1X\n    dataset = ANI1X()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5023802\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.HF_CC_PVDZ,\n        PotentialMethod.HF_CC_PVQZ,\n        PotentialMethod.HF_CC_PVTZ,\n        PotentialMethod.MP2_CC_PVDZ,\n        PotentialMethod.MP2_CC_PVQZ,\n        PotentialMethod.MP2_CC_PVTZ,\n        PotentialMethod.WB97X_6_31G_D,\n        PotentialMethod.WB97X_CC_PVTZ,\n    ]\n\n    energy_target_names = [\n        \"HF:cc-pVDZ Total Energy\",\n        \"HF:cc-pVQZ Total Energy\",\n        \"HF:cc-pVTZ Total Energy\",\n        \"MP2:cc-pVDZ Correlation Energy\",\n        \"MP2:cc-pVQZ Correlation Energy\",\n        \"MP2:cc-pVTZ Correlation Energy\",\n        \"wB97x:6-31G(d) Total Energy\",\n        \"wB97x:def2-TZVPP Total 
Energy\",\n    ]\n\n    force_target_names = [\n        \"wB97x:6-31G(d) Atomic Forces\",\n        \"wB97x:def2-TZVPP Atomic Forces\",\n    ]\n\n    __force_mask__ = [False, False, False, False, False, False, True, True]\n    __links__ = {\"ani1ccx.hdf5.gz\": \"https://zenodo.org/record/4081692/files/293.hdf5.gz\"}\n\n    def convert_forces(self, x):\n        return super().convert_forces(x) * 0.529177249  # correct the Dataset error\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI2X","title":"<code>ANI2X</code>","text":"<p>               Bases: <code>ANI1</code></p> <p>The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are used for generating geometries.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI2X\ndataset = ANI2X()\n</code></pre></p> References <p>https://doi.org/10.1021/acs.jctc.0c00121 https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI2X(ANI1):\n    \"\"\"\n    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.\n    It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized\n    using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. 
The same sampling techniques as done in ANI-1X are\n    used for generating geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI2X\n    dataset = ANI2X()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.0c00121\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani2x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        # PotentialMethod.NONE,  # \"b973c/def2mtzvp\",\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/631gd\", # PAPER DATASET\n        # PotentialMethod.NONE,  # \"wb97md3bj/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97mv/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97x/def2tzvpp\",\n    ]\n\n    energy_target_names = [\n        # \"b973c/def2mtzvp\",\n        \"wb97x/631gd\",\n        # \"wb97md3bj/def2tzvpp\",\n        # \"wb97mv/def2tzvpp\",\n        # \"wb97x/def2tzvpp\",\n    ]\n\n    force_target_names = [\"wb97x/631gd\"]  # \"b973c/def2mtzvp\",\n\n    __force_mask__ = [True]\n    __links__ = {  # \"ANI-2x-B973c-def2mTZVP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1\", # noqa\n        # \"ANI-2x-wB97MV-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1\", # noqa\n        \"ANI-2x-wB97X-631Gd.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97X-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1\", # noqa\n    }\n\n    def __smiles_converter__(self, x):\n        return x\n\n    def read_raw_entries(self):\n        samples = []\n        for lvl_theory in self.__links__.keys():\n          
  raw_path = p_join(self.root, \"final_h5\", f\"{lvl_theory.split('.')[0]}.h5\")\n            samples.extend(read_ani2_h5(raw_path))\n        return samples\n</code></pre>"},{"location":"API/datasets/comp6.html","title":"Comp6","text":""},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6","title":"<code>COMP6</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfield charges and molecular dipoles.</p> Details of the benchmark sets are as follows <p>S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and</p> <p>mixed influence interactions.</p> <pre><code>ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n</code></pre> <p>dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point calculations are performed to calculate energies and forces.</p> <pre><code>GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n</code></pre> <p>The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence criteria. Normal modes/force constants are computer using the reference DFT model. 
Finally, Diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.</p> <pre><code>GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n</code></pre> <p>and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are generated via DNMS.</p> <pre><code>Tripeptide: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n\nDrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n</code></pre> <p>Structures are optimized similar to GDB7to9.</p> <p>Usage: <pre><code>from openqdc.datasets import COMP6\ndataset = COMP6()\n</code></pre></p> References <p>https://aip.scitation.org/doi/abs/10.1063/1.5023802</p> <p>https://github.com/isayev/COMP6</p> <p>S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d</p> <p>GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/</p> <p>GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/</p> <p>DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h</p> Source code in <code>openqdc/datasets/potential/comp6.py</code> <pre><code>class COMP6(BaseDataset):\n    \"\"\"\n    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the\n    ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and\n    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using\n    the wB97x density functional with the 6-31G(d) basis set. 
The dataset also includes Hirshfield charges and\n    molecular dipoles.\n\n    Details of the benchmark sets are as follows:\n        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and\n    mixed influence interactions.\\n\n        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small\n    proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point\n    calculations are performed to calculate energies and forces.\\n\n        GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n    The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence\n    criteria. Normal modes/force constants are computer using the reference DFT model. Finally, Diverse normal\n    mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\\n\n        GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are\n    generated via DNMS.\\n\n        Tripeptide: Consists of 248 random tripeptides. 
Structures are optimized similar to GDB7to9.\\n\n        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n    Structures are optimized similar to GDB7to9.\n\n    Usage:\n    ```python\n    from openqdc.datasets import COMP6\n    dataset = COMP6()\n    ```\n\n    References:\n        https://aip.scitation.org/doi/abs/10.1063/1.5023802\\n\n        https://github.com/isayev/COMP6\\n\n        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\\n\n        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\\n\n        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\\n\n        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h\n    \"\"\"\n\n    __name__ = \"comp6\"\n\n    # watchout that forces are stored as -grad(E)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"  # angstorm\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g*\",\n        PotentialMethod.B3LYP_D3_BJ_DEF2_TZVP,  # \"b3lyp-d3(bj)/def2-tzvp\",\n        PotentialMethod.B3LYP_DEF2_TZVP,  # \"b3lyp/def2-tzvp\",\n        PotentialMethod.HF_DEF2_TZVP,  # \"hf/def2-tzvp\",\n        PotentialMethod.PBE_D3_BJ_DEF2_TZVP,  # \"pbe-d3(bj)/def2-tzvp\",\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n        PotentialMethod.SVWN_DEF2_TZVP,  # \"svwn/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"Energy\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP:def2-tzvp\",\n        \"HF:def2-tzvp\",\n        \"PBE-D3M(BJ):def2-tzvp\",\n        \"PBE:def2-tzvp\",\n        \"SVWN:def2-tzvp\",\n    ]\n    __force_mask__ = [True, False, False, False, False, False, False]\n\n    force_target_names = [\n        \"Gradient\",\n    ]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n      
  \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        samples = []\n        for subset in [\"ani_md\", \"drugbank\", \"gdb7_9\", \"gdb10_13\", \"s66x8\", \"tripeptides\"]:\n            raw_path = p_join(self.root, f\"{subset}.h5.gz\")\n            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/potential/comp6.py</code> <pre><code>def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n</code></pre>"},{"location":"API/datasets/des.html","title":"DES","text":""},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES370K","title":"<code>DES370K</code>","text":"<p>               Bases: <code>BaseInteractionDataset</code>, <code>IDES</code></p> <p>DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions) including water and functional groups found in proteins. 
Dimer geometries are generated using QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.</p> <p>Usage: <pre><code>from openqdc.datasets import DES370K\ndataset = DES370K()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DES370K(BaseInteractionDataset, IDES):\n    \"\"\"\n    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies\n    computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules\n    and ions) including water and functional groups found in proteins. Dimer geometries are generated using\n    QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES370K\n    dataset = DES370K()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des370k_interaction\"\n    __filename__ = \"DES370K.csv\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVDZ,\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_CC_PVDZ,\n        InteractionMethod.CCSD_T_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"cc_MP2_all\",\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"cc_CCSD(T)_all\",\n        \"cbs_CCSD(T)_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES370K.zip\": \"https://zenodo.org/record/5676266/files/DES370K.zip\",\n    }\n\n    @property\n    def csv_path(self):\n        return os.path.join(self.root, self.__filename__)\n\n    def _create_subsets(self, **kwargs):\n        return create_subset(kwargs[\"smiles0\"], kwargs[\"smiles1\"])\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        filepath = self.csv_path\n        logger.info(f\"Reading {self.__name__} interaction data from {filepath}\")\n        df = pd.read_csv(filepath)\n        data = []\n        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):\n            item = parse_des_df(row, self.energy_target_names)\n            item[\"subset\"] = self._create_subsets(row=row, **item)\n            item = convert_to_record(item)\n            data.append(item)\n        return 
data\n</code></pre>"},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES5M","title":"<code>DES5M</code>","text":"<p>               Bases: <code>DES370K</code></p> <p>DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using QM based optimization and MD simulations.</p> <p>Usage: <pre><code>from openqdc.datasets import DES5M\ndataset = DES5M()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DES5M(DES370K):\n    \"\"\"\n    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies\n    computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using\n    QM based optimization and MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES5M\n    dataset = DES5M()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des5m_interaction\"\n    __filename__ = \"DES5M.csv\"\n\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        
InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES5M.zip\": \"https://zenodo.org/records/5706002/files/DESS5M.zip?download=1\",\n    }\n</code></pre>"},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66","title":"<code>DESS66</code>","text":"<p>               Bases: <code>DES370K</code></p> <p>DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. 
The protocol for estimating energies is based on the DES370K paper.</p> <p>Usage: <pre><code>from openqdc.datasets import DESS66\ndataset = DESS66()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> <p>S66: https://pubs.acs.org/doi/10.1021/ct2002946</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DESS66(DES370K):\n    \"\"\"\n    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total.\n    The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66\n    dataset = DESS66()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\\n\n        S66: https://pubs.acs.org/doi/10.1021/ct2002946\n    \"\"\"\n\n    __name__ = \"des_s66\"\n    __filename__ = \"DESS66.csv\"\n    __links__ = {\"DESS66.zip\": \"https://zenodo.org/records/5676284/files/DESS66.zip?download=1\"}\n\n    def _create_subsets(self, **kwargs):\n        return kwargs[\"row\"][\"system_name\"]\n</code></pre>"},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66x8","title":"<code>DESS66x8</code>","text":"<p>               Bases: <code>DESS66</code></p> <p>DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. 
The protocol for estimating energies is based on the DES370K paper.</p> <p>Usage: <pre><code>from openqdc.datasets import DESS66x8\ndataset = DESS66x8()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DESS66x8(DESS66):\n    \"\"\"\n    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve\n    giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66x8\n    dataset = DESS66x8()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des_s66x8\"\n    __filename__ = \"DESS66x8.csv\"\n    __links__ = {\"DESS66x8.zip\": \"https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1\"}\n</code></pre>"},{"location":"API/datasets/gdml.html","title":"GDML","text":""},{"location":"API/datasets/gdml.html#openqdc.datasets.potential.gdml.GDML","title":"<code>GDML</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for each conformation are computed using the PBE + vdW-TS electronic structure method. 
</p> The dataset consists of the following trajectories <p>Benzene: 627000 samples</p> <p>Uracil: 133000 samples</p> <p>Naphthalene: 326000 samples</p> <p>Aspirin: 211000 samples</p> <p>Salicylic Acid: 320000 samples</p> <p>Malonaldehyde: 993000 samples</p> <p>Ethanol: 555000 samples</p> <p>Toluene: 100000 samples</p> <p>Usage: <pre><code>from openqdc.datasets import GDML\ndataset = GDML()\n</code></pre></p> References <p>https://www.science.org/doi/10.1126/sciadv.1603015 http://www.sgdml.org/#datasets</p> Source code in <code>openqdc/datasets/potential/gdml.py</code> <pre><code>class GDML(BaseDataset):\n    \"\"\"\n    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio\n    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene\n    (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin\n    (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations),\n    Ethanol (555000 conformations) and Toluene (100000 conformations). 
Energy and force labels for\n    each conformation are computed using the PBE + vdW-TS electronic structure method.\n    molecular dynamics (AIMD) trajectories.\n\n    The dataset consists of the following trajectories:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import GDML\n    dataset = GDML()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.1603015\n        http://www.sgdml.org/#datasets\n    \"\"\"\n\n    __name__ = \"gdml\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_CC_PVDZ,  # \"ccsd/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        # TODO: verify if basis set vdw-ts == def2-tzvp and\n        # it is the same in ISO17 and revmd17\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",  # MD17\n    ]\n\n    energy_target_names = [\n        \"CCSD Energy\",\n        \"CCSD(T) Energy\",\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True, True, True]\n\n    force_target_names = [\n        \"CCSD Gradient\",\n        \"CCSD(T) Gradient\",\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __links__ = {\n        \"gdb7_9.hdf5.gz\": \"https://zenodo.org/record/3588361/files/208.hdf5.gz\",\n        \"gdb10_13.hdf5.gz\": \"https://zenodo.org/record/3588364/files/209.hdf5.gz\",\n        \"drugbank.hdf5.gz\": \"https://zenodo.org/record/3588361/files/207.hdf5.gz\",\n        \"tripeptides.hdf5.gz\": \"https://zenodo.org/record/3588368/files/211.hdf5.gz\",\n        \"ani_md.hdf5.gz\": \"https://zenodo.org/record/3588341/files/205.hdf5.gz\",\n        
\"s66x8.hdf5.gz\": \"https://zenodo.org/record/3588367/files/210.hdf5.gz\",\n    }\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"gdml.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"gdml\", self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/geom.html","title":"GEOM","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.</p> <p>Usage: <pre><code>from openqdc.datasets import GEOM\ndataset = GEOM()\n</code></pre></p> References <p>https://www.nature.com/articles/s41597-022-01288-4</p> <p>https://github.com/learningmatter-mit/geom</p> <p>CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d</p> Source code in <code>openqdc/datasets/potential/geom.py</code> <pre><code>class GEOM(BaseDataset):\n    \"\"\"\n    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules\n    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.\n    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and\n    the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the\n    conformational space for each molecule. 
Energies in the dataset are computed using semi-empirical method GFN2-xTB.\n\n    Usage:\n    ```python\n    from openqdc.datasets import GEOM\n    dataset = GEOM()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-022-01288-4\\n\n        https://github.com/learningmatter-mit/geom\\n\n        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d\n    \"\"\"\n\n    __name__ = \"geom\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    energy_target_names = [\"gfn2_xtb.energy\"]\n    force_target_names = []\n\n    partitions = [\"qm9\", \"drugs\"]\n    __links__ = {\"rdkit_folder.tar.gz\": \"https://dataverse.harvard.edu/api/access/datafile/4327252\"}\n\n    def _read_raw_(self, partition):\n        raw_path = p_join(self.root, \"rdkit_folder\")\n\n        mols = load_json(p_join(raw_path, f\"summary_{partition}.json\"))\n        mols = list(mols.items())\n\n        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731\n        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job\n        return samples\n\n    def read_raw_entries(self):\n        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])\n        return samples\n</code></pre>"},{"location":"API/datasets/iso_17.html","title":"ISO_17","text":""},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17","title":"<code>ISO17</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics trajectories. 
The simulations were carried out using density functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.</p> <p>Usage: <pre><code>from openqdc.datasets import ISO17\ndataset = ISO17()\n</code></pre></p> References <p>https://arxiv.org/abs/1706.08566</p> <p>https://arxiv.org/abs/1609.08259</p> <p>https://www.nature.com/articles/sdata201422</p> <p>https://pubmed.ncbi.nlm.nih.gov/10062328/</p> <p>https://pubmed.ncbi.nlm.nih.gov/19257665/</p> Source code in <code>openqdc/datasets/potential/iso_17.py</code> <pre><code>class ISO17(BaseDataset):\n    \"\"\"\n    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of\n    atoms (C7O2H10) arranged in different chemically valid structures. It consist of 129 molecules, each containing\n    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics\n    trajectories. 
The simulations were carried out using density functional theory (DFT) in the generalized gradient\n    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der\n    Waals correction method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ISO17\n    dataset = ISO17()\n    ```\n\n    References:\n        https://arxiv.org/abs/1706.08566\\n\n        https://arxiv.org/abs/1609.08259\\n\n        https://www.nature.com/articles/sdata201422\\n\n        https://pubmed.ncbi.nlm.nih.gov/10062328/\\n\n        https://pubmed.ncbi.nlm.nih.gov/19257665/\n    \"\"\"\n\n    __name__ = \"iso_17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"iso_17.hdf5.gz\": \"https://zenodo.org/record/3585907/files/216.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"iso_17.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"iso_17\", self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/potential/iso_17.py</code> <pre><code>def 
__smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n</code></pre>"},{"location":"API/datasets/l7.html","title":"L7","text":""},{"location":"API/datasets/l7.html#openqdc.datasets.interaction.l7.L7","title":"<code>L7</code>","text":"<p>               Bases: <code>YamlDataset</code></p> <p>The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are taken from crystal X-ray data and optimized with a DFT method specific to the complex.</p> <p>Usage: <pre><code>from openqdc.datasets import L7\ndataset = L7()\n</code></pre></p> Reference <p>https://pubs.acs.org/doi/10.1021/ct400036b</p> Source code in <code>openqdc/datasets/interaction/l7.py</code> <pre><code>class L7(YamlDataset):\n    \"\"\"\n    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with\n    energies labelled using semi-empirical and quantum mechanical methods. 
The intial geometries are\n    taken from crystal X-ray data and optimized with a DFT method specific to the complex.\n\n    Usage:\n    ```python\n    from openqdc.datasets import L7\n    dataset = L7()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct400036b\n    \"\"\"\n\n    __name__ = \"l7\"\n    __energy_methods__ = [\n        InteractionMethod.QCISDT_CBS,  # \"QCISD(T)/CBS\",\n        InteractionMethod.DLPNO_CCSDT,  # \"DLPNO-CCSD(T)\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.MP2C_CBS,  # \"MP2C/CBS\",\n        InteractionMethod.FIXED,  # \"fixed\", TODO: we should remove this level of theory because unless we have a pro\n        InteractionMethod.DLPNO_CCSDT0,  # \"DLPNO-CCSD(T0)\",\n        InteractionMethod.LNO_CCSDT,  # \"LNO-CCSD(T)\",\n        InteractionMethod.FN_DMC,  # \"FN-DMC\",\n    ]\n    __links__ = {\n        \"l7.yaml\": \"http://cuby4.molecular.cz/download_datasets/l7.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/L7.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.geometry.split(\":\")[1]\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        return np.array([int(item.setup[\"molecule_a\"][\"selection\"].split(\"-\")[1])], dtype=np.int32)\n</code></pre>"},{"location":"API/datasets/md22.html","title":"MD22","text":""},{"location":"API/datasets/md22.html#openqdc.datasets.potential.md22.MD22","title":"<code>MD22</code>","text":"<p>               Bases: <code>RevMD17</code></p> <p>MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories are sampled at 400K and 500K with a resolution of 1fs. 
Potential energy and forces are computed using the PBE+MBD level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import MD22\ndataset = MD22()\n</code></pre></p> Reference <p>https://arxiv.org/abs/2209.14865</p> Source code in <code>openqdc/datasets/potential/md22.py</code> <pre><code>class MD22(RevMD17):\n    \"\"\"\n    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,\n    ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories\n    are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD\n    level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MD22\n    dataset = MD22()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2209.14865\n    \"\"\"\n\n    __name__ = \"md22\"\n    __links__ = {\n        f\"{x}.npz\": f\"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz\"\n        for x in [\n            \"Ac-Ala3-NHMe\",\n            \"DHA\",\n            \"stachyose\",\n            \"AT-AT\",\n            \"AT-AT-CG-CG\",\n            \"double-walled_nanotube\",\n            \"buckyball-catcher\",\n        ]\n    }\n\n    def read_raw_entries(self):\n        entries_list = []\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n</code></pre>"},{"location":"API/datasets/metcalf.html","title":"Metcalf","text":""},{"location":"API/datasets/metcalf.html#openqdc.datasets.interaction.metcalf.Metcalf","title":"<code>Metcalf</code>","text":"<p>               Bases: <code>BaseInteractionDataset</code></p> <p>Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and the cc-pVTZ basis set. 
SAPT(0) calculations are performed for computing interaction energies and the various components.</p> <p>Usage: <pre><code>from openqdc.datasets import Metcalf\ndataset = Metcalf()\n</code></pre></p> Reference <p>https://doi.org/10.1063/1.5142636</p> Source code in <code>openqdc/datasets/interaction/metcalf.py</code> <pre><code>class Metcalf(BaseInteractionDataset):\n    \"\"\"\n    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to\n    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and\n    the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various\n    components.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Metcalf\n    dataset = Metcalf()\n    ```\n\n    Reference:\n        https://doi.org/10.1063/1.5142636\n    \"\"\"\n\n    __name__ = \"metcalf\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n    ]\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = [\n        \"total energy\",\n        \"electrostatic energy\",\n        \"exchange energy\",\n        \"induction energy\",\n        \"dispersion energy\",\n    ]\n    __links__ = {\"model-data.tar.gz\": \"https://zenodo.org/records/10934211/files/model-data.tar?download=1\"}\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        # extract in folders\n        extract_raw_tar_gz(self.root)\n        data = []\n        for filename in glob(self.root + 
f\"{os.sep}*.xyz\"):\n            data.extend(read_xyz(filename, self.__name__))\n        return data\n</code></pre>"},{"location":"API/datasets/molecule3d.html","title":"Molecule3D","text":""},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.Molecule3D","title":"<code>Molecule3D</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, or with damaged log files.</p> <p>Usage: <pre><code>from openqdc.datasets import Molecule3D\ndataset = Molecule3D()\n</code></pre></p> References <p>https://arxiv.org/abs/2110.01717</p> <p>https://github.com/divelab/MoleculeX</p> Source code in <code>openqdc/datasets/potential/molecule3d.py</code> <pre><code>class Molecule3D(BaseDataset):\n    \"\"\"\n    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the\n    B3LYP/6-31G* level of theory. 
The molecules are extracted from the PubChem database and cleaned by removing\n    molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems,\n    or with damaged log files.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Molecule3D\n    dataset = Molecule3D()\n    ```\n\n    References:\n        https://arxiv.org/abs/2110.01717\\n\n        https://github.com/divelab/MoleculeX\n    \"\"\"\n\n    __name__ = \"molecule3d\"\n    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # \"b3lyp/6-31g*\",\n    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY\n    __energy_unit__ = \"ev\"  # CALCULATED\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"molecule3d.zip\": \"https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy\"}\n\n    energy_target_names = [\"b3lyp/6-31g*.energy\"]\n\n    def read_raw_entries(self):\n        raw = p_join(self.root, \"data\", \"raw\")\n        sdf_paths = glob(p_join(raw, \"*.sdf\"))\n        properties_path = p_join(raw, \"properties.csv\")\n\n        fn = lambda x: _read_sdf(x, properties_path)\n        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job\n        samples = sum(res, [])\n        return samples\n</code></pre>"},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol","title":"<code>read_mol(mol, energy)</code>","text":"<p>Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies</p>"},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--parameters","title":"Parameters","text":"<p>mol: Chem.rdchem.Mol     RDKit molecule energy: float     Energy of the molecule</p>"},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--returns","title":"Returns","text":"<p>res: dict     Dictionary containing the following keys:     - name: np.ndarray of shape (N,) 
containing the smiles of the molecule     - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions     - energies: np.ndarray of shape (1,) containing the energy of the conformer     - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer     - subset: np.ndarray of shape (1) containing \"molecule3d\"</p> Source code in <code>openqdc/datasets/potential/molecule3d.py</code> <pre><code>def read_mol(mol: Chem.rdchem.Mol, energy: float) -&gt; Dict[str, np.ndarray]:\n    \"\"\"Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies\n\n    Parameters\n    ----------\n    mol: Chem.rdchem.Mol\n        RDKit molecule\n    energy: float\n        Energy of the molecule\n\n    Returns\n    -------\n    res: dict\n        Dictionary containing the following keys:\n        - name: np.ndarray of shape (N,) containing the smiles of the molecule\n        - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions\n        - energies: np.ndarray of shape (1,) containing the energy of the conformer\n        - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer\n        - subset: np.ndarray of shape (1) containing \"molecule3d\"\n    \"\"\"\n    smiles = dm.to_smiles(mol, explicit_hs=False)\n    # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)\n    x = get_atomic_number_and_charge(mol)\n    positions = mol.GetConformer().GetPositions()\n\n    res = dict(\n        name=np.array([smiles]),\n        subset=np.array([\"molecule3d\"]),\n        energies=np.array([energy]).astype(np.float64)[:, None],\n        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),\n        n_atoms=np.array([x.shape[0]], dtype=np.int32),\n    )\n\n    return 
res\n</code></pre>"},{"location":"API/datasets/multixcqm9.html","title":"MultixcQM9","text":""},{"location":"API/datasets/multixcqm9.html#openqdc.datasets.potential.multixcqm9.MultixcQM9","title":"<code>MultixcQM9</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the molecules are used directly from Kim et al. which uses G4MP2 method.</p> <p>Usage: <pre><code>from openqdc.datasets import MultixcQM9\ndataset = MultixcQM9()\n</code></pre></p> References <p>https://www.nature.com/articles/s41597-023-02690-2</p> <p>https://github.com/chemsurajit/largeDFTdata</p> <p>https://www.nature.com/articles/s41597-019-0121-7</p> Source code in <code>openqdc/datasets/potential/multixcqm9.py</code> <pre><code>class MultixcQM9(BaseDataset):\n    \"\"\"\n    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting\n    of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets\n    resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the\n    molecules are used directly from Kim et al. 
which uses G4MP2 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MultixcQM9\n    dataset = MultixcQM9()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-023-02690-2\\n\n        https://github.com/chemsurajit/largeDFTdata\\n\n        https://www.nature.com/articles/s41597-019-0121-7\\n\n    \"\"\"\n\n    __name__ = \"multixcqm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.KCIS_MODIFIED_DZP,\n        PotentialMethod.KCIS_ORIGINAL_DZP,\n        PotentialMethod.PKZB_DZP,\n        PotentialMethod.VS98_DZP,\n        PotentialMethod.LDA_VWN_DZP,\n        PotentialMethod.PW91_DZP,\n        PotentialMethod.BLYP_DZP,\n        PotentialMethod.BP_DZP,\n        PotentialMethod.PBE_DZP,\n        PotentialMethod.RPBE_DZP,\n        PotentialMethod.REVPBE_DZP,\n        PotentialMethod.OLYP_DZP,\n        PotentialMethod.FT97_DZP,\n        PotentialMethod.BLAP3_DZP,\n        PotentialMethod.HCTH_93_DZP,\n        PotentialMethod.HCTH_120_DZP,\n        PotentialMethod.HCTH_147_DZP,\n        PotentialMethod.HCTH_407_DZP,\n        PotentialMethod.BMTAU1_DZP,\n        PotentialMethod.BOP_DZP,\n        PotentialMethod.PKZBX_KCISCOR_DZP,\n        PotentialMethod.VS98_X_XC_DZP,\n        PotentialMethod.VS98_X_ONLY_DZP,\n        PotentialMethod.BECKE00_DZP,\n        PotentialMethod.BECKE00X_XC_DZP,\n        PotentialMethod.BECKE00_X_ONLY_DZP,\n        PotentialMethod.BECKE88X_BR89C_DZP,\n        PotentialMethod.OLAP3_DZP,\n        PotentialMethod.TPSS_DZP,\n        PotentialMethod.MPBE_DZP,\n        PotentialMethod.OPBE_DZP,\n        PotentialMethod.OPERDEW_DZP,\n        PotentialMethod.MPBEKCIS_DZP,\n        PotentialMethod.MPW_DZP,\n        PotentialMethod.TAU_HCTH_DZP,\n        PotentialMethod.XLYP_DZP,\n        PotentialMethod.KT1_DZP,\n        PotentialMethod.KT2_DZP,\n        PotentialMethod.M06_L_DZP,\n        PotentialMethod.BLYP_D_DZP,\n        PotentialMethod.BP86_D_DZP,\n        PotentialMethod.PBE_D_DZP,\n        
PotentialMethod.TPSSD_DZP,\n        PotentialMethod.B97_D_DZP,\n        PotentialMethod.REVTPSS_DZP,\n        PotentialMethod.PBESOL_DZP,\n        PotentialMethod.RGE2_DZP,\n        PotentialMethod.SSB_D_DZP,\n        PotentialMethod.MVS_DZP,\n        PotentialMethod.MVSX_DZP,\n        PotentialMethod.TMGGA_DZP,\n        PotentialMethod.TPSSH_DZP,\n        PotentialMethod.B3LYP_VWN5_DZP,\n        PotentialMethod.O3LYP_VWN5_DZP,\n        PotentialMethod.KMLYP_VWN5_DZP,\n        PotentialMethod.PBE0_DZP,\n        PotentialMethod.B3LYP_S_VWN5_DZP,\n        PotentialMethod.BHANDH_DZP,\n        PotentialMethod.BHANDHLYP_DZP,\n        PotentialMethod.B97_DZP,\n        PotentialMethod.B97_1_DZP,\n        PotentialMethod.B97_2_DZP,\n        PotentialMethod.MPBE0KCIS_DZP,\n        PotentialMethod.MPBE1KCIS_DZP,\n        PotentialMethod.B1LYP_VWN5_DZP,\n        PotentialMethod.B1PW91_VWN5_DZP,\n        PotentialMethod.MPW1PW_DZP,\n        PotentialMethod.MPW1K_DZP,\n        PotentialMethod.TAU_HCTH_HYBRID_DZP,\n        PotentialMethod.X3LYP_VWN5_DZP,\n        PotentialMethod.OPBE0_DZP,\n        PotentialMethod.M05_DZP,\n        PotentialMethod.M05_2X_DZP,\n        PotentialMethod.M06_DZP,\n        PotentialMethod.M06_2X_DZP,\n        PotentialMethod.B3LYP_D_DZP,\n        PotentialMethod.KCIS_MODIFIED_TZP,\n        PotentialMethod.KCIS_ORIGINAL_TZP,\n        PotentialMethod.PKZB_TZP,\n        PotentialMethod.VS98_TZP,\n        PotentialMethod.LDA_VWN_TZP,\n        PotentialMethod.PW91_TZP,\n        PotentialMethod.BLYP_TZP,\n        PotentialMethod.BP_TZP,\n        PotentialMethod.PBE_TZP,\n        PotentialMethod.RPBE_TZP,\n        PotentialMethod.REVPBE_TZP,\n        PotentialMethod.OLYP_TZP,\n        PotentialMethod.FT97_TZP,\n        PotentialMethod.BLAP3_TZP,\n        PotentialMethod.HCTH_93_TZP,\n        PotentialMethod.HCTH_120_TZP,\n        PotentialMethod.HCTH_147_TZP,\n        PotentialMethod.HCTH_407_TZP,\n        PotentialMethod.BMTAU1_TZP,\n        
PotentialMethod.BOP_TZP,\n        PotentialMethod.PKZBX_KCISCOR_TZP,\n        PotentialMethod.VS98_X_XC_TZP,\n        PotentialMethod.VS98_X_ONLY_TZP,\n        PotentialMethod.BECKE00_TZP,\n        PotentialMethod.BECKE00X_XC_TZP,\n        PotentialMethod.BECKE00_X_ONLY_TZP,\n        PotentialMethod.BECKE88X_BR89C_TZP,\n        PotentialMethod.OLAP3_TZP,\n        PotentialMethod.TPSS_TZP,\n        PotentialMethod.MPBE_TZP,\n        PotentialMethod.OPBE_TZP,\n        PotentialMethod.OPERDEW_TZP,\n        PotentialMethod.MPBEKCIS_TZP,\n        PotentialMethod.MPW_TZP,\n        PotentialMethod.TAU_HCTH_TZP,\n        PotentialMethod.XLYP_TZP,\n        PotentialMethod.KT1_TZP,\n        PotentialMethod.KT2_TZP,\n        PotentialMethod.M06_L_TZP,\n        PotentialMethod.BLYP_D_TZP,\n        PotentialMethod.BP86_D_TZP,\n        PotentialMethod.PBE_D_TZP,\n        PotentialMethod.TPSSD_TZP,\n        PotentialMethod.B97_D_TZP,\n        PotentialMethod.REVTPSS_TZP,\n        PotentialMethod.PBESOL_TZP,\n        PotentialMethod.RGE2_TZP,\n        PotentialMethod.SSB_D_TZP,\n        PotentialMethod.MVS_TZP,\n        PotentialMethod.MVSX_TZP,\n        PotentialMethod.TMGGA_TZP,\n        PotentialMethod.TPSSH_TZP,\n        PotentialMethod.B3LYP_VWN5_TZP,\n        PotentialMethod.O3LYP_VWN5_TZP,\n        PotentialMethod.KMLYP_VWN5_TZP,\n        PotentialMethod.PBE0_TZP,\n        PotentialMethod.B3LYP_S_VWN5_TZP,\n        PotentialMethod.BHANDH_TZP,\n        PotentialMethod.BHANDHLYP_TZP,\n        PotentialMethod.B97_TZP,\n        PotentialMethod.B97_1_TZP,\n        PotentialMethod.B97_2_TZP,\n        PotentialMethod.MPBE0KCIS_TZP,\n        PotentialMethod.MPBE1KCIS_TZP,\n        PotentialMethod.B1LYP_VWN5_TZP,\n        PotentialMethod.B1PW91_VWN5_TZP,\n        PotentialMethod.MPW1PW_TZP,\n        PotentialMethod.MPW1K_TZP,\n        PotentialMethod.TAU_HCTH_HYBRID_TZP,\n        PotentialMethod.X3LYP_VWN5_TZP,\n        PotentialMethod.OPBE0_TZP,\n        PotentialMethod.M05_TZP,\n  
      PotentialMethod.M05_2X_TZP,\n        PotentialMethod.M06_TZP,\n        PotentialMethod.M06_2X_TZP,\n        PotentialMethod.B3LYP_D_TZP,\n        PotentialMethod.KCIS_MODIFIED_SZ,\n        PotentialMethod.KCIS_ORIGINAL_SZ,\n        PotentialMethod.PKZB_SZ,\n        PotentialMethod.VS98_SZ,\n        PotentialMethod.LDA_VWN_SZ,\n        PotentialMethod.PW91_SZ,\n        PotentialMethod.BLYP_SZ,\n        PotentialMethod.BP_SZ,\n        PotentialMethod.PBE_SZ,\n        PotentialMethod.RPBE_SZ,\n        PotentialMethod.REVPBE_SZ,\n        PotentialMethod.OLYP_SZ,\n        PotentialMethod.FT97_SZ,\n        PotentialMethod.BLAP3_SZ,\n        PotentialMethod.HCTH_93_SZ,\n        PotentialMethod.HCTH_120_SZ,\n        PotentialMethod.HCTH_147_SZ,\n        PotentialMethod.HCTH_407_SZ,\n        PotentialMethod.BMTAU1_SZ,\n        PotentialMethod.BOP_SZ,\n        PotentialMethod.PKZBX_KCISCOR_SZ,\n        PotentialMethod.VS98_X_XC_SZ,\n        PotentialMethod.VS98_X_ONLY_SZ,\n        PotentialMethod.BECKE00_SZ,\n        PotentialMethod.BECKE00X_XC_SZ,\n        PotentialMethod.BECKE00_X_ONLY_SZ,\n        PotentialMethod.BECKE88X_BR89C_SZ,\n        PotentialMethod.OLAP3_SZ,\n        PotentialMethod.TPSS_SZ,\n        PotentialMethod.MPBE_SZ,\n        PotentialMethod.OPBE_SZ,\n        PotentialMethod.OPERDEW_SZ,\n        PotentialMethod.MPBEKCIS_SZ,\n        PotentialMethod.MPW_SZ,\n        PotentialMethod.TAU_HCTH_SZ,\n        PotentialMethod.XLYP_SZ,\n        PotentialMethod.KT1_SZ,\n        PotentialMethod.KT2_SZ,\n        PotentialMethod.M06_L_SZ,\n        PotentialMethod.BLYP_D_SZ,\n        PotentialMethod.BP86_D_SZ,\n        PotentialMethod.PBE_D_SZ,\n        PotentialMethod.TPSSD_SZ,\n        PotentialMethod.B97_D_SZ,\n        PotentialMethod.REVTPSS_SZ,\n        PotentialMethod.PBESOL_SZ,\n        PotentialMethod.RGE2_SZ,\n        PotentialMethod.SSB_D_SZ,\n        PotentialMethod.MVS_SZ,\n        PotentialMethod.MVSX_SZ,\n        PotentialMethod.TMGGA_SZ,\n        
PotentialMethod.TPSSH_SZ,\n        PotentialMethod.B3LYP_VWN5_SZ,\n        PotentialMethod.O3LYP_VWN5_SZ,\n        PotentialMethod.KMLYP_VWN5_SZ,\n        PotentialMethod.PBE0_SZ,\n        PotentialMethod.B3LYP_S_VWN5_SZ,\n        PotentialMethod.BHANDH_SZ,\n        PotentialMethod.BHANDHLYP_SZ,\n        PotentialMethod.B97_SZ,\n        PotentialMethod.B97_1_SZ,\n        PotentialMethod.B97_2_SZ,\n        PotentialMethod.MPBE0KCIS_SZ,\n        PotentialMethod.MPBE1KCIS_SZ,\n        PotentialMethod.B1LYP_VWN5_SZ,\n        PotentialMethod.B1PW91_VWN5_SZ,\n        PotentialMethod.MPW1PW_SZ,\n        PotentialMethod.MPW1K_SZ,\n        PotentialMethod.TAU_HCTH_HYBRID_SZ,\n        PotentialMethod.X3LYP_VWN5_SZ,\n        PotentialMethod.OPBE0_SZ,\n        PotentialMethod.M05_SZ,\n        PotentialMethod.M05_2X_SZ,\n        PotentialMethod.M06_SZ,\n        PotentialMethod.M06_2X_SZ,\n        PotentialMethod.B3LYP_D_SZ,\n        PotentialMethod.GFN2_XTB,\n    ]\n\n    energy_target_names = [\n        \"KCIS-MODIFIED/DZP\",\n        \"KCIS-ORIGINAL/DZP\",\n        \"PKZB/DZP\",\n        \"VS98/DZP\",\n        \"LDA(VWN)/DZP\",\n        \"PW91/DZP\",\n        \"BLYP/DZP\",\n        \"BP/DZP\",\n        \"PBE/DZP\",\n        \"RPBE/DZP\",\n        \"REVPBE/DZP\",\n        \"OLYP/DZP\",\n        \"FT97/DZP\",\n        \"BLAP3/DZP\",\n        \"HCTH/93/DZP\",\n        \"HCTH/120/DZP\",\n        \"HCTH/147/DZP\",\n        \"HCTH/407/DZP\",\n        \"BMTAU1/DZP\",\n        \"BOP/DZP\",\n        \"PKZBX-KCISCOR/DZP\",\n        \"VS98-X(XC)/DZP\",\n        \"VS98-X-ONLY/DZP\",\n        \"BECKE00/DZP\",\n        \"BECKE00X(XC)/DZP\",\n        \"BECKE00-X-ONLY/DZP\",\n        \"BECKE88X+BR89C/DZP\",\n        \"OLAP3/DZP\",\n        \"TPSS/DZP\",\n        \"MPBE/DZP\",\n        \"OPBE/DZP\",\n        \"OPERDEW/DZP\",\n        \"MPBEKCIS/DZP\",\n        \"MPW/DZP\",\n        \"TAU-HCTH/DZP\",\n        \"XLYP/DZP\",\n        \"KT1/DZP\",\n        \"KT2/DZP\",\n        \"M06-L/DZP\",\n   
     \"BLYP-D/DZP\",\n        \"BP86-D/DZP\",\n        \"PBE-D/DZP\",\n        \"TPSS-D/DZP\",\n        \"B97-D/DZP\",\n        \"REVTPSS/DZP\",\n        \"PBESOL/DZP\",\n        \"RGE2/DZP\",\n        \"SSB-D/DZP\",\n        \"MVS/DZP\",\n        \"MVSX/DZP\",\n        \"T-MGGA/DZP\",\n        \"TPSSH/DZP\",\n        \"B3LYP(VWN5)/DZP\",\n        \"O3LYP(VWN5)/DZP\",\n        \"KMLYP(VWN5)/DZP\",\n        \"PBE0/DZP\",\n        \"B3LYP*(VWN5)/DZP\",\n        \"BHANDH/DZP\",\n        \"BHANDHLYP/DZP\",\n        \"B97/DZP\",\n        \"B97-1/DZP\",\n        \"B97-2/DZP\",\n        \"MPBE0KCIS/DZP\",\n        \"MPBE1KCIS/DZP\",\n        \"B1LYP(VWN5)/DZP\",\n        \"B1PW91(VWN5)/DZP\",\n        \"MPW1PW/DZP\",\n        \"MPW1K/DZP\",\n        \"TAU-HCTH-HYBRID/DZP\",\n        \"X3LYP(VWN5)/DZP\",\n        \"OPBE0/DZP\",\n        \"M05/DZP\",\n        \"M05-2X/DZP\",\n        \"M06/DZP\",\n        \"M06-2X/DZP\",\n        \"B3LYP-D/DZP\",\n        \"KCIS-MODIFIED/TZP\",\n        \"KCIS-ORIGINAL/TZP\",\n        \"PKZB/TZP\",\n        \"VS98/TZP\",\n        \"LDA(VWN)/TZP\",\n        \"PW91/TZP\",\n        \"BLYP/TZP\",\n        \"BP/TZP\",\n        \"PBE/TZP\",\n        \"RPBE/TZP\",\n        \"REVPBE/TZP\",\n        \"OLYP/TZP\",\n        \"FT97/TZP\",\n        \"BLAP3/TZP\",\n        \"HCTH/93/TZP\",\n        \"HCTH/120/TZP\",\n        \"HCTH/147/TZP\",\n        \"HCTH/407/TZP\",\n        \"BMTAU1/TZP\",\n        \"BOP/TZP\",\n        \"PKZBX-KCISCOR/TZP\",\n        \"VS98-X(XC)/TZP\",\n        \"VS98-X-ONLY/TZP\",\n        \"BECKE00/TZP\",\n        \"BECKE00X(XC)/TZP\",\n        \"BECKE00-X-ONLY/TZP\",\n        \"BECKE88X+BR89C/TZP\",\n        \"OLAP3/TZP\",\n        \"TPSS/TZP\",\n        \"MPBE/TZP\",\n        \"OPBE/TZP\",\n        \"OPERDEW/TZP\",\n        \"MPBEKCIS/TZP\",\n        \"MPW/TZP\",\n        \"TAU-HCTH/TZP\",\n        \"XLYP/TZP\",\n        \"KT1/TZP\",\n        \"KT2/TZP\",\n        \"M06-L/TZP\",\n        \"BLYP-D/TZP\",\n        
\"BP86-D/TZP\",\n        \"PBE-D/TZP\",\n        \"TPSS-D/TZP\",\n        \"B97-D/TZP\",\n        \"REVTPSS/TZP\",\n        \"PBESOL/TZP\",\n        \"RGE2/TZP\",\n        \"SSB-D/TZP\",\n        \"MVS/TZP\",\n        \"MVSX/TZP\",\n        \"T-MGGA/TZP\",\n        \"TPSSH/TZP\",\n        \"B3LYP(VWN5)/TZP\",\n        \"O3LYP(VWN5)/TZP\",\n        \"KMLYP(VWN5)/TZP\",\n        \"PBE0/TZP\",\n        \"B3LYP*(VWN5)/TZP\",\n        \"BHANDH/TZP\",\n        \"BHANDHLYP/TZP\",\n        \"B97/TZP\",\n        \"B97-1/TZP\",\n        \"B97-2/TZP\",\n        \"MPBE0KCIS/TZP\",\n        \"MPBE1KCIS/TZP\",\n        \"B1LYP(VWN5)/TZP\",\n        \"B1PW91(VWN5)/TZP\",\n        \"MPW1PW/TZP\",\n        \"MPW1K/TZP\",\n        \"TAU-HCTH-HYBRID/TZP\",\n        \"X3LYP(VWN5)/TZP\",\n        \"OPBE0/TZP\",\n        \"M05/TZP\",\n        \"M05-2X/TZP\",\n        \"M06/TZP\",\n        \"M06-2X/TZP\",\n        \"B3LYP-D/TZP\",\n        \"KCIS-MODIFIED/SZ\",\n        \"KCIS-ORIGINAL/SZ\",\n        \"PKZB/SZ\",\n        \"VS98/SZ\",\n        \"LDA(VWN)/SZ\",\n        \"PW91/SZ\",\n        \"BLYP/SZ\",\n        \"BP/SZ\",\n        \"PBE/SZ\",\n        \"RPBE/SZ\",\n        \"REVPBE/SZ\",\n        \"OLYP/SZ\",\n        \"FT97/SZ\",\n        \"BLAP3/SZ\",\n        \"HCTH/93/SZ\",\n        \"HCTH/120/SZ\",\n        \"HCTH/147/SZ\",\n        \"HCTH/407/SZ\",\n        \"BMTAU1/SZ\",\n        \"BOP/SZ\",\n        \"PKZBX-KCISCOR/SZ\",\n        \"VS98-X(XC)/SZ\",\n        \"VS98-X-ONLY/SZ\",\n        \"BECKE00/SZ\",\n        \"BECKE00X(XC)/SZ\",\n        \"BECKE00-X-ONLY/SZ\",\n        \"BECKE88X+BR89C/SZ\",\n        \"OLAP3/SZ\",\n        \"TPSS/SZ\",\n        \"MPBE/SZ\",\n        \"OPBE/SZ\",\n        \"OPERDEW/SZ\",\n        \"MPBEKCIS/SZ\",\n        \"MPW/SZ\",\n        \"TAU-HCTH/SZ\",\n        \"XLYP/SZ\",\n        \"KT1/SZ\",\n        \"KT2/SZ\",\n        \"M06-L/SZ\",\n        \"BLYP-D/SZ\",\n        \"BP86-D/SZ\",\n        \"PBE-D/SZ\",\n        \"TPSS-D/SZ\",\n        
\"B97-D/SZ\",\n        \"REVTPSS/SZ\",\n        \"PBESOL/SZ\",\n        \"RGE2/SZ\",\n        \"SSB-D/SZ\",\n        \"MVS/SZ\",\n        \"MVSX/SZ\",\n        \"T-MGGA/SZ\",\n        \"TPSSH/SZ\",\n        \"B3LYP(VWN5)/SZ\",\n        \"O3LYP(VWN5)/SZ\",\n        \"KMLYP(VWN5)/SZ\",\n        \"PBE0/SZ\",\n        \"B3LYP*(VWN5)/SZ\",\n        \"BHANDH/SZ\",\n        \"BHANDHLYP/SZ\",\n        \"B97/SZ\",\n        \"B97-1/SZ\",\n        \"B97-2/SZ\",\n        \"MPBE0KCIS/SZ\",\n        \"MPBE1KCIS/SZ\",\n        \"B1LYP(VWN5)/SZ\",\n        \"B1PW91(VWN5)/SZ\",\n        \"MPW1PW/SZ\",\n        \"MPW1K/SZ\",\n        \"TAU-HCTH-HYBRID/SZ\",\n        \"X3LYP(VWN5)/SZ\",\n        \"OPBE0/SZ\",\n        \"M05/SZ\",\n        \"M05-2X/SZ\",\n        \"M06/SZ\",\n        \"M06-2X/SZ\",\n        \"B3LYP-D/SZ\",\n        \"GFNXTB\",\n    ]\n\n    __energy_unit__ = \"ev\"  # to fix\n    __distance_unit__ = \"ang\"  # to fix\n    __forces_unit__ = \"ev/ang\"  # to fix\n    __links__ = {\n        \"xyz.zip\": \"https://data.dtu.dk/ndownloader/files/35143624\",\n        \"xtb.zip\": \"https://data.dtu.dk/ndownloader/files/42444300\",\n        \"dzp.zip\": \"https://data.dtu.dk/ndownloader/files/42443925\",\n        \"tzp.zip\": \"https://data.dtu.dk/ndownloader/files/42444129\",\n        \"sz.zip\": \"https://data.dtu.dk/ndownloader/files/42441345\",\n        \"failed_indices.dat\": \"https://data.dtu.dk/ndownloader/files/37337677\",\n    }\n\n    def _read_molecules_energies(self):\n        d = {\"DZP\": None, \"TZP\": None, \"SZ\": None, \"XTB\": None}\n        for basis in d.keys():\n            d[basis] = pd.read_csv(p_join(self.root, basis, \"molecules/molecules.csv\"), index_col=False).drop(\n                columns=[\"index\"]\n            )\n        return pd.concat([d[\"DZP\"], d[\"TZP\"], d[\"SZ\"], d[\"XTB\"]], axis=1, ignore_index=False)\n\n    def _read_all_xyzs(self):\n        xyz_list = read_xyz_files(self.root)\n        return pd.DataFrame(xyz_list)\n\n    def 
read_raw_entries(self):\n        df_energies = self._read_molecules_energies()\n        df_xyz = self._read_all_xyzs()\n        return [\n            {\"energies\": np.atleast_2d(en), **xyz_dict}\n            for xyz_dict, en in zip(df_xyz.to_dict(\"records\"), df_energies.values.astype(np.float64))\n        ]\n</code></pre>"},{"location":"API/datasets/nabladft.html","title":"NablaDFT","text":""},{"location":"API/datasets/nabladft.html#openqdc.datasets.potential.nabladft.NablaDFT","title":"<code>NablaDFT</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>NablaDFT is a dataset constructed from a subset of the Molecular Sets (MOSES) dataset consisting of 1 million molecules with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set. This results in 1-62 conformations per molecule. For generating quantum properties, Kohn-Sham method at wB97X-D/def2-XVP levels are used to generate the energy.</p> <p>Usage: <pre><code>from openqdc.datasets import NablaDFT\ndataset = NablaDFT()\n</code></pre></p> References <p>https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D</p> <p>https://github.com/AIRI-Institute/nablaDFT</p> Source code in <code>openqdc/datasets/potential/nabladft.py</code> <pre><code>class NablaDFT(BaseDataset):\n    \"\"\"\n    NablaDFT is a dataset constructed from a subset of the\n    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules\n    with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of\n    conformations are generated using RDKit. 
Second, using Butina Clustering Method on conformations, clusters that\n    cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set.\n    This results in 1-62 conformations per molecule. For generating quantum properties, Kohn-Sham method at\n    wB97X-D/def2-XVP levels are used to generate the energy.\n\n    Usage:\n    ```python\n    from openqdc.datasets import NablaDFT\n    dataset = NablaDFT()\n    ```\n\n    References:\n        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\\n\n        https://github.com/AIRI-Institute/nablaDFT\n    \"\"\"\n\n    __name__ = \"nabladft\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D_DEF2_SVP,\n    ]  # \"wb97x-d/def2-svp\"\n\n    energy_target_names = [\"wb97x-d/def2-svp\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"nabladft.db\": \"https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db\"}\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    @requires_package(\"nablaDFT\")\n    def read_raw_entries(self):\n        from nablaDFT.dataset import HamiltonianDatabase\n\n        label_path = p_join(self.root, \"summary.csv\")\n        df = pd.read_csv(label_path, usecols=[\"MOSES id\", \"CONFORMER id\", \"SMILES\", \"DFT TOTAL ENERGY\"])\n        labels = df.set_index(keys=[\"MOSES id\", \"CONFORMER id\"]).to_dict(\"index\")\n\n        raw_path = p_join(self.root, \"dataset_full.db\")\n        train = HamiltonianDatabase(raw_path)\n        n, c = len(train), 20\n        step_size = int(np.ceil(n / os.cpu_count()))\n\n        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)\n        
samples = dm.parallelized(\n            fn, list(range(c)), n_jobs=c, progress=False, scheduler=\"threads\"\n        )  # don't use more than 1 job\n\n        return sum(samples, [])\n</code></pre>"},{"location":"API/datasets/orbnet_denali.html","title":"Orbnet Denali","text":""},{"location":"API/datasets/orbnet_denali.html#openqdc.datasets.potential.orbnet_denali.OrbnetDenali","title":"<code>OrbnetDenali</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import OrbnetDenali\ndataset = OrbnetDenali()\n</code></pre></p> References <p>https://arxiv.org/abs/2107.00299</p> <p>https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867</p> Source code in <code>openqdc/datasets/potential/orbnet_denali.py</code> <pre><code>class OrbnetDenali(BaseDataset):\n    \"\"\"\n    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range\n    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and\n    counterions, spanning the most common elements in bio and organic chemistry. 
Geometries are generated in 2 steps.\n    First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer\n    generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using\n    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of\n    theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of\n    theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import OrbnetDenali\n    dataset = OrbnetDenali()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00299\\n\n        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867\n    \"\"\"\n\n    __name__ = \"orbnet_denali\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_DEF2_TZVP,\n        PotentialMethod.GFN1_XTB,\n    ]  # [\"wb97x-d3/def2-tzvp\", \"gfn1_xtb\"]\n    energy_target_names = [\"dft_energy\", \"xtb1_energy\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"orbnet_denali.tar.gz\": \"https://figshare.com/ndownloader/files/28672287\",\n        \"orbnet_denali_targets.tar.gz\": \"https://figshare.com/ndownloader/files/28672248\",\n    }\n\n    def read_raw_entries(self):\n        label_path = p_join(self.root, \"denali_labels.csv\")\n        df = pd.read_csv(label_path, usecols=[\"sample_id\", \"mol_id\", \"subset\", \"dft_energy\", \"xtb1_energy\"])\n        labels = {\n            mol_id: group.drop([\"mol_id\"], axis=1).drop_duplicates(\"sample_id\").set_index(\"sample_id\").to_dict(\"index\")\n            for mol_id, group in df.groupby(\"mol_id\")\n        }\n\n        fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)\n        res = dm.parallelized(fn, list(labels.items()), scheduler=\"threads\", n_jobs=-1, progress=True)\n        
samples = sum(res, [])\n        return samples\n</code></pre>"},{"location":"API/datasets/pcqm.html","title":"PCQM","text":""},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_B3LYP","title":"<code>PCQM_B3LYP</code>","text":"<p>               Bases: <code>PCQM_PM6</code></p> <p>PubChemQC B3LYP/6-31G (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry, the electronic structure and properties are calculated using B3LIP/6-31G method.</p> <p>Usage: <pre><code>from openqdc.datasets import PCQM_B3LYP\ndataset = PCQM_B3LYP()\n</code></pre></p> References <p>https://arxiv.org/abs/2305.18454</p> Source code in <code>openqdc/datasets/potential/pcqm.py</code> <pre><code>class PCQM_B3LYP(PCQM_PM6):\n    \"\"\"\n    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to\n    biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry,\n    the electronic structure and properties are calculated using B3LIP/6-31G* method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_B3LYP\n    dataset = PCQM_B3LYP()\n    ```\n\n    References:\n        https://arxiv.org/abs/2305.18454\n    \"\"\"\n\n    __name__ = \"pubchemqc_b3lyp\"\n    __energy_methods__ = [\"b3lyp/6-31g*\"]\n    energy_target_names = [\"b3lyp\"]\n</code></pre>"},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_PM6","title":"<code>PCQM_PM6</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with weights less than 1000g/mol are considered from the PubChem ftp site. 
The initial structure is generated using OpenBabel and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also computed using the PM6 method.</p> <p>Usage: <pre><code>from openqdc.datasets import PCQM_PM6\ndataset = PCQM_PM6()\n</code></pre></p> References <p>https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740</p> Source code in <code>openqdc/datasets/potential/pcqm.py</code> <pre><code>class PCQM_PM6(BaseDataset):\n    \"\"\"\n    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized\n    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less\n    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel\n    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also\n    computed using the PM6 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_PM6\n    dataset = PCQM_PM6()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740\n    \"\"\"\n\n    __name__ = \"pubchemqc_pm6\"\n    __energy_methods__ = [PotentialMethod.PM6]\n\n    energy_target_names = [\"pm6\"]\n\n    __force_methods__ = []\n    force_target_names = []\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"pubchemqc\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def collate_list(self, list_entries):\n        predicat = list_entries is not None and len(list_entries) &gt; 0\n        list_entries = [x for x in list_entries if x is not None]\n        if predicat:\n            res = super().collate_list(list_entries)\n        else:\n            res = None\n        return res\n\n    @property\n    def data_types(self):\n        
return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    def read_raw_entries(self):\n        arxiv_paths = glob(p_join(self.root, f\"{self.__energy_methods__[0]}\", \"*.pkl\"))\n        f = lambda x: self.collate_list(read_preprocessed_archive(x))\n        samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True)\n        samples = [x for x in samples if x is not None]\n        return samples\n\n    def preprocess(self, overwrite=False):\n        if overwrite or not self.is_preprocessed():\n            logger.info(\"Preprocessing data and saving it to cache.\")\n            logger.info(\n                f\"Dataset {self.__name__} data with the following units:\\n\"\n                f\"Energy: {self.energy_unit}, Distance: {self.distance_unit}, \"\n                f\"Forces: {self.force_unit if self.__force_methods__ else 'None'}\"\n            )\n            entries = self.read_raw_entries()\n            self.collate_and_save_list(entries)\n\n    def collate_and_save_list(self, list_entries):\n        n_molecules, n_atoms = 0, 0\n        for i in range(len(list_entries)):\n            list_entries[i][\"position_idx_range\"] += n_atoms\n            n_atoms += list_entries[i][\"position_idx_range\"].max()\n            n_molecules += list_entries[i][\"position_idx_range\"].shape[0]\n\n        for key in self.data_keys:\n            first = list_entries[0][key]\n            shape = (n_molecules, *first.shape[1:])\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\")\n            out = np.memmap(local_path, mode=\"w+\", dtype=first.dtype, shape=shape)\n\n            start = 0\n            for i in range(len(list_entries)):\n                x = list_entries[i].pop(key)\n                n = x.shape[0]\n                out[start : start + n] = x\n                out.flush()\n            push_remote(local_path, 
overwrite=True)\n\n        # save smiles and subset\n        tmp, n = dict(name=[]), len(list_entries)\n        local_path = p_join(self.preprocess_path, \"props.pkl\")\n        names = [list_entries[i].pop(\"name\") for i in range(n)]\n        f = lambda xs: [dm.to_inchikey(x) for x in xs]\n        res = dm.parallelized(f, names, n_jobs=-1, progress=False)\n        for x in res:\n            tmp[\"name\"] += x\n        for key in [\"subset\", \"n_atoms\"]:\n            tmp[key] = []\n            for i in range(n):\n                tmp[key] += list(list_entries[i].pop(key))\n        with open(local_path, \"wb\") as f:\n            pkl.dump(tmp, f)\n        push_remote(local_path, overwrite=True)\n</code></pre>"},{"location":"API/datasets/proteinfragments.html","title":"Protein Fragments","text":""},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.MDDataset","title":"<code>MDDataset</code>","text":"<p>               Bases: <code>ProteinFragments</code></p> <p>MDDataset is a subset of the proteinfragments dataset that generated from the molecular dynamics with their model. The sampling was done with Molecular Dynamics at room temperature 300K in various solvent phase:</p> Subsets <p>Polyalanine:     All the polyalanine are sampled in gas phase. 
AceAla15Lys is     a polyalanine peptides capped with an N-terminal acetyl group     and a protonated lysine residue at the C-terminus,     Acela15nme is polyalanine peptide capped with an N-terminal acetyl group     and a C-terminal N-methyl amide group</p> <p>Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)</p> <p>Usage: <pre><code>from openqdc.datasets import MDDataset\ndataset = MDDataset()\n</code></pre></p> References <p>https://www.science.org/doi/10.1126/sciadv.adn4397</p> Source code in <code>openqdc/datasets/potential/proteinfragments.py</code> <pre><code>class MDDataset(ProteinFragments):\n    \"\"\"\n    MDDataset is a subset of the proteinfragments dataset that\n    generated from the molecular dynamics with their model.\n    The sampling was done with Molecular Dynamics\n    at room temperature 300K in various solvent phase:\n\n    Subsets:\n        Polyalanine:\n            All the polyalanine are sampled in gas phase. AceAla15Lys is\n            a polyalanine peptides capped with an N-terminal acetyl group\n            and a protonated lysine residue at the C-terminus,\n            Acela15nme is polyalanine peptide capped with an N-terminal acetyl group\n            and a C-terminal N-methyl amide group\\n\n        Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)\n\n    Usage:\n    ```python\n    from openqdc.datasets import MDDataset\n    dataset = MDDataset()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"mddataset\"\n\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"acala15nme_folding_clusters\", \"crambin\", \"minimahopping_acala15lysh\", \"minimahopping_acala15nme\"]\n    
}\n</code></pre>"},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.ProteinFragments","title":"<code>ProteinFragments</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>ProteinFragments is a dataset constructed from a subset of the the data was generated from a top-down and bottom-up approach:</p> Top-down <p>Fragments are generated by cutting out a spherical region around an atom (including solvent molecules) and saturating all dangling bonds. Sampling was done with the Molecular Dynamics (MD) method from conventional FF at room temperature.</p> Bottom-up <p>Fragments are generated by constructing chemical graphs of one to eight nonhydrogen atoms. Sampling of multiple conformers per fragments was done with MD simulations at high temperatures or normal mode sampling.</p> <p>Usage: <pre><code>from openqdc.datasets import ProteinFragments\ndataset = ProteinFragments()\n</code></pre></p> References <p>https://www.science.org/doi/10.1126/sciadv.adn4397</p> Source code in <code>openqdc/datasets/potential/proteinfragments.py</code> <pre><code>class ProteinFragments(BaseDataset):\n    \"\"\"\n    ProteinFragments is a dataset constructed from a subset of the\n    the data was generated from a top-down and bottom-up approach:\n\n    Top-down:\n        Fragments are generated by cutting out a spherical\n        region around an atom (including solvent molecules)\n        and saturating all dangling bonds.\n        Sampling was done with the Molecular Dynamics (MD) method from\n        conventional FF at room temperature.\n\n    Bottom-up:\n        Fragments are generated by constructing chemical graphs\n        of one to eight nonhydrogen atoms.\n        Sampling of multiple conformers per fragments was done with\n        MD simulations at high temperatures or normal mode sampling.\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import ProteinFragments\n    dataset = ProteinFragments()\n    ```\n\n   
 References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"proteinfragments\"\n    # PBE0/def2-TZVPP+MBD\n    __energy_methods__ = [\n        PotentialMethod.PBE0_MBD_DEF2_TZVPP,\n    ]\n\n    energy_target_names = [\n        \"PBE0+MBD/def2-TZVPP\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"general_protein_fragments\"]\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"proteinfragments\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"proteinfragments\", links=self.__links__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.extend(read_db(raw_path))\n        return samples\n</code></pre>"},{"location":"API/datasets/qm1b.html","title":"QM1B","text":""},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B","title":"<code>QM1B</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. 
Electronic properties for each conformation are then calculated using the density functional B3LYP and the basis set STO-3G.</p> <p>Usage: <pre><code>from openqdc.datasets import QM1B\ndataset = QM1B()\n</code></pre></p> References <p>https://arxiv.org/pdf/2311.01135</p> <p>https://github.com/graphcore-research/qm1b-dataset/</p> Source code in <code>openqdc/datasets/potential/qm1b.py</code> <pre><code>class QM1B(BaseDataset):\n    \"\"\"\n    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom\n    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are\n    subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit.\n    Electronic properties for each conformation are then calculated using the density functional B3LYP\n    and the basis set STO-3G.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B\n    dataset = QM1B()\n    ```\n\n    References:\n        https://arxiv.org/pdf/2311.01135\\n\n        https://github.com/graphcore-research/qm1b-dataset/\n    \"\"\"\n\n    __name__ = \"qm1b\"\n\n    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]\n    __force_methods__ = []\n\n    energy_target_names = [\"b3lyp/sto-3g\"]\n    force_target_names = []\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"ev/bohr\"\n    __links__ = {\n        \"qm1b_validation.parquet\": \"https://ndownloader.figshare.com/files/43005175\",\n        **{f\"part_{i:03d}.parquet\": f\"https://ndownloader.figshare.com/files/{FILE_NUM[i]}\" for i in range(0, 256)},\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qm1b\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        
filenames = list(map(lambda x: p_join(self.root, f\"part_{x:03d}.parquet\"), list(range(0, 256)))) + [\n            p_join(self.root, \"qm1b_validation.parquet\")\n        ]\n\n        def read_entries_parallel(filename):\n            df = pd.read_parquet(filename)\n\n            def extract_parallel(df, i):\n                return extract_from_row(df.iloc[i])\n\n            fn = partial(extract_parallel, df)\n            list_of_idxs = list(range(len(df)))\n            results = dm.utils.parallelized(fn, list_of_idxs, scheduler=\"threads\", progress=False)\n            return results\n\n        list_of_list = dm.utils.parallelized(read_entries_parallel, filenames, scheduler=\"processes\", progress=True)\n\n        return [x for xs in list_of_list for x in xs]\n</code></pre>"},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B_SMALL","title":"<code>QM1B_SMALL</code>","text":"<p>               Bases: <code>QM1B</code></p> <p>QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.</p> <p>Usage: <pre><code>from openqdc.datasets import QM1B_SMALL\ndataset = QM1B_SMALL()\n</code></pre></p> Source code in <code>openqdc/datasets/potential/qm1b.py</code> <pre><code>class QM1B_SMALL(QM1B):\n    \"\"\"\n    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B_SMALL\n    dataset = QM1B_SMALL()\n    ```\n    \"\"\"\n\n    __name__ = \"qm1b_small\"\n</code></pre>"},{"location":"API/datasets/qm7x.html","title":"QM7X","text":""},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X","title":"<code>QM7X</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. 
For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta- stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.</p> <p>Usage: <pre><code>from openqdc.datasets import QM7X\ndataset = QM7X()\n</code></pre></p> References <p>https://arxiv.org/abs/2006.15139</p> <p>https://zenodo.org/records/4288677</p> Source code in <code>openqdc/datasets/potential/qm7x.py</code> <pre><code>class QM7X(BaseDataset):\n    \"\"\"\n    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with\n    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations,\n    OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-\n    stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure\n    is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)\n    interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non\n    -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of\n    normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. 
The dataset has\n    energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X\n    dataset = QM7X()\n    ```\n\n    References:\n        https://arxiv.org/abs/2006.15139\\n\n        https://zenodo.org/records/4288677\n    \"\"\"\n\n    __name__ = \"qm7x\"\n\n    __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # \"pbe0/def2-tzvp\", \"dft3b\"]\n\n    energy_target_names = [\"ePBE0+MBD\", \"eDFTB+MBD\"]\n\n    __force_mask__ = [True, False]\n\n    force_target_names = [\"pbe0FOR\"]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {f\"{i}000.xz\": f\"https://zenodo.org/record/4288677/files/{i}000.xz\" for i in range(1, 9)}\n\n    def read_raw_entries(self):\n        samples = []\n        for i in range(1, 9):\n            raw_path = p_join(self.root, f\"{i}000\")\n            data = load_hdf5_file(raw_path)\n            samples += [\n                read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys())\n            ]\n\n        return samples\n</code></pre>"},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X_V2","title":"<code>QM7X_V2</code>","text":"<p>               Bases: <code>QM7X</code></p> <p>QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.</p> <p>Usage: <pre><code>from openqdc.datasets import QM7X_V2\ndataset = QM7X_V2()\n</code></pre></p> Source code in <code>openqdc/datasets/potential/qm7x.py</code> <pre><code>class QM7X_V2(QM7X):\n    \"\"\"\n    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X_V2\n    dataset = QM7X_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qm7x_v2\"\n    __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]\n  
  __force_mask__ = QM7X.__force_mask__ + [False]\n    energy_target_names = QM7X.energy_target_names + [\"PM6\"]\n    force_target_names = QM7X.force_target_names\n</code></pre>"},{"location":"API/datasets/qmugs.html","title":"Qmugs","text":""},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs","title":"<code>QMugs</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).</p> <p>Usage: <pre><code>from openqdc.datasets import QMugs\ndataset = QMugs()\n</code></pre></p> References <p>https://arxiv.org/abs/2107.00367</p> <p>https://www.nature.com/articles/s41597-022-01390-7#ethics</p> <p>https://www.research-collection.ethz.ch/handle/20.500.11850/482129</p> Source code in <code>openqdc/datasets/potential/qmugs.py</code> <pre><code>class QMugs(BaseDataset):\n    \"\"\"\n    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules\n    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB\n    method. 
Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical\n    method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs\n    dataset = QMugs()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00367\\n\n        https://www.nature.com/articles/s41597-022-01390-7#ethics\\n\n        https://www.research-collection.ethz.ch/handle/20.500.11850/482129\n    \"\"\"\n\n    __name__ = \"qmugs\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP]  # \"gfn2_xtb\", \"wb97x-d/def2-svp\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"summary.csv\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&amp;files=summary.csv\",\n        \"structures.tar.gz\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&amp;files=structures.tar.gz\",  # noqa\n    }\n\n    energy_target_names = [\n        \"GFN2:TOTAL_ENERGY\",\n        \"DFT:TOTAL_ENERGY\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"structures\")\n        mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]\n\n        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler=\"threads\")\n        return samples\n</code></pre>"},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs_V2","title":"<code>QMugs_V2</code>","text":"<p>               Bases: <code>QMugs</code></p> <p>QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 4.2M geometries.</p> <p>Usage: <pre><code>from openqdc.datasets import QMugs_V2\ndataset = QMugs_V2()\n</code></pre></p> Source code in <code>openqdc/datasets/potential/qmugs.py</code> <pre><code>class QMugs_V2(QMugs):\n    \"\"\"\n    QMugs_V2 is an extension of the QMugs dataset 
containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs_V2\n    dataset = QMugs_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qmugs_v2\"\n    __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]\n    energy_target_names = QMugs.energy_target_names + [\"PM6\"]\n    __force_mask__ = QMugs.__force_mask__ + [False]\n</code></pre>"},{"location":"API/datasets/qmx.html","title":"QMX","text":""},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7","title":"<code>QM7</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM7 is a dataset constructed from subsets of the GDB-13 database ( stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecules conformation are optimized using DFT at the PBE0/def2-TZVP level of theory.</p> Chemical species <p>[C, N, O, S, H]</p> <p>Usage: <pre><code>from openqdc.datasets import QM7\ndataset = QM7()\n</code></pre></p> References <p>https://arxiv.org/pdf/1703.00564</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM7(QMX):\n    \"\"\"\n    QM7 is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7\n    dataset = QM7()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7.hdf5.gz\": \"https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1\"}\n    __name__ = \"qm7\"\n\n    energy_target_names = [\n        \"B2PLYP-D3(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3(BJ):def2-svp\",\n        
\"B2PLYP-D3(BJ):def2-tzvp\",\n        \"B2PLYP-D3(BJ):sto-3g\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"MP2:aug-cc-pvdz\",\n        \"MP2:aug-cc-pvtz\",\n        \"MP2:def2-svp\",\n        \"MP2:def2-tzvp\",\n        \"MP2:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        
\"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7b","title":"<code>QM7b</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM7b is a dataset constructed from subsets of the GDB-13 database ( stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecules conformation are optimized using DFT at the PBE0/def2-TZVP level of theory.</p> Chemical species <p>[C, N, O, S, Cl, H]</p> <p>Usage: <pre><code>from openqdc.datasets import QM7b\ndataset = QM7b()\n</code></pre></p> References <p>https://arxiv.org/pdf/1703.00564</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM7b(QMX):\n    \"\"\"\n    QM7b is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, Cl, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7b\n    dataset = QM7b()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7b.hdf5.gz\": \"https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1\"}\n    __name__ = 
\"qm7b\"\n    energy_target_names = [\n        \"CCSD(T0):cc-pVDZ\",\n        \"HF:cc-pVDZ\",\n        \"HF:cc-pVTZ\",\n        \"MP2:cc-pVTZ\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        
\"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"]\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM8","title":"<code>QM8</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM8 is the subset of QM9 used in a study on modeling quantum mechanical calculations of electronic spectra and excited state energy (a increase of energy from the ground states) of small molecules up to eight heavy atoms. Multiple methods were used, including time-dependent density functional theories (TDDFT) and second-order approximate coupled-cluster (CC2). The molecules conformations are relaxed geometries computed using the DFT B3LYP with basis set 6-31G(2df,p). 
For more information about the sampling, check QM9 dataset.</p> <p>Usage: <pre><code>from openqdc.datasets import QM8\ndataset = QM8()\n</code></pre></p> References <p>https://arxiv.org/pdf/1504.01966</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM8(QMX):\n    \"\"\"QM8 is the subset of QM9 used in a study on modeling quantum\n    mechanical calculations of electronic spectra and excited\n    state energy (a increase of energy from the ground states) of small molecules\n    up to eight heavy atoms.\n    Multiple methods were used, including\n    time-dependent density functional theories (TDDFT) and\n    second-order approximate coupled-cluster (CC2).\n    The molecules conformations are relaxed geometries computed using\n    the DFT B3LYP with basis set 6-31G(2df,p).\n    For more information about the sampling, check QM9 dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM8\n    dataset = QM8()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1504.01966\n    \"\"\"\n\n    __name__ = \"qm8\"\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n\n    __links__ = {\n        \"qm8.csv\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv\",\n        \"qm8.tar.gz\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz\",\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"qm8.csv\"))\n        mols = dm.read_sdf(p_join(self.root, \"qm8.sdf\"), sanitize=False, remove_hs=False)\n        samples = []\n        for idx_row, mol in zip(df.iterrows(), mols):\n            _, row = idx_row\n            positions = mol.GetConformer().GetPositions()\n            x = 
get_atomic_number_and_charge(mol)\n            n_atoms = positions.shape[0]\n            samples.append(\n                dict(\n                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),\n                    name=np.array([row[\"smiles\"]]),\n                    energies=np.array(\n                        [\n                            row[\n                                [\"E1-CC2\", \"E2-CC2\", \"E1-PBE0\", \"E2-PBE0\", \"E1-PBE0.1\", \"E2-PBE0.1\", \"E1-CAM\", \"E2-CAM\"]\n                            ].tolist()\n                        ],\n                        dtype=np.float64,\n                    ).reshape(1, -1),\n                    n_atoms=np.array([n_atoms], dtype=np.int32),\n                    subset=np.array([f\"{self.__name__}\"]),\n                )\n            )\n        return samples\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM9","title":"<code>QM9</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM9 is a dataset of 134k molecules from subsets of the GDB-17 database, containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed by relaxing geometries with quantum mechanical method B3LYP.</p> <p>Usage: <pre><code>from openqdc.datasets import QM9\ndataset = QM9()\n</code></pre></p> Reference <p>https://www.nature.com/articles/sdata201422</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM9(QMX):\n    \"\"\"\n    QM9 is a dataset of 134k molecules from subsets of the GDB-17 database,\n    containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p)\n    level of quantum chemistry. 
For each of the 134k molecules, equilibrium geometries are computed\n    by relaxing geometries with quantum mechanical method B3LYP.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM9\n    dataset = QM9()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/sdata201422\n    \"\"\"\n\n    __links__ = {\"qm9.hdf5.gz\": \"https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1\"}\n    __name__ = \"qm9\"\n    energy_target_names = [\n        \"Internal energy at 0 K\",\n        \"B3LYP:def2-svp\",\n        \"HF:cc-pvtz\",\n        \"HF:sto-3g\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n    ]\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QMX","title":"<code>QMX</code>","text":"<p>               Bases: <code>ABC</code>, <code>BaseDataset</code></p> <p>QMX dataset base abstract class</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QMX(ABC, BaseDataset):\n    \"\"\"\n    QMX dataset base abstract class\n    \"\"\"\n\n    __name__ = \"qm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qmx\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, 
\"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"qmx\", links=self.__links__)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)\n        return samples\n</code></pre>"},{"location":"API/datasets/revmd17.html","title":"RevMD17","text":""},{"location":"API/datasets/revmd17.html#openqdc.datasets.potential.revmd17.RevMD17","title":"<code>RevMD17</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and very dense DFT integration grid. The dataset contains the following molecules:     Benzene: 627000 samples</p> <pre><code>Uracil: 133000 samples\n\nNaphthalene: 326000 samples\n\nAspirin: 211000 samples\n\nSalicylic Acid: 320000 samples\n\nMalonaldehyde: 993000 samples\n\nEthanol: 555000 samples\n\nToluene: 100000 samples\n</code></pre> <p>Usage: <pre><code>from openqdc.datasets import RevMD17\ndataset = RevMD17()\n</code></pre></p> References <p>https://arxiv.org/abs/2007.09593</p> Source code in <code>openqdc/datasets/potential/revmd17.py</code> <pre><code>class RevMD17(BaseDataset):\n    \"\"\"\n    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original\n    dataset. 
The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies\n    are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and very dense DFT integration\n    grid. The dataset contains the following molecules:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naphthalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import RevMD17\n    dataset = RevMD17()\n    ```\n\n    References:\n        https://arxiv.org/abs/2007.09593\n    \"\"\"\n\n    __name__ = \"revmd17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP\n        # \"pbe/def2-tzvp\",\n    ]\n    __force_mask__ = [True]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_methods__ = [\n        \"pbe/def2-tzvp\",\n    ]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n    __links__ = {\"revmd17.zip\": \"https://figshare.com/ndownloader/articles/12672038/versions/3\"}\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    def read_raw_entries(self):\n        entries_list = []\n        decompress_tar_gz(p_join(self.root, \"rmd17.tar.bz2\"))\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n</code></pre>"},{"location":"API/datasets/sn2_rxn.html","title":"SN2 RXN","text":""},{"location":"API/datasets/sn2_rxn.html#openqdc.datasets.potential.sn2_rxn.SN2RXN","title":"<code>SN2RXN</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>This dataset probes chemical reactions of methyl halides with halide anions, i.e. 
X- + CH3Y -&gt; CH3X +  Y-, and contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset contains 452,709 structures along with the energy, force and dipole moments.</p> <p>Usage: <pre><code>from openqdc.datasets import SN2RXN\ndataset = SN2RXN()\n</code></pre></p> References <p>https://doi.org/10.1021/acs.jctc.9b00181</p> <p>https://zenodo.org/records/2605341</p> Source code in <code>openqdc/datasets/potential/sn2_rxn.py</code> <pre><code>class SN2RXN(BaseDataset):\n    \"\"\"\n    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -&gt; CH3X +  Y-, and\n    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by\n    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment\n    (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and\n    for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. 
The dataset\n    contains 452,709 structures along with the energy, force and dipole moments.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SN2RXN\n    dataset = SN2RXN()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605341\n    \"\"\"\n\n    __name__ = \"sn2_rxn\"\n\n    __energy_methods__ = [\n        PotentialMethod.DSD_BLYP_D3_BJ_DEF2_TZVP\n        # \"dsd-blyp-d3(bj)/def2-tzvp\",\n    ]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"sn2_rxn.npz\": \"https://zenodo.org/records/2605341/files/sn2_reactions.npz\"}\n\n    energy_target_names = [\n        # TODO: We need to revalidate this to make sure that is not atomization energies.\n        \"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"DSD-BLYP-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"sn2_rxn.npz\")\n        data = np.load(raw_path)\n        samples = extract_npz_entry(data)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/solvated_peptides.html","title":"Solvated Peptides","text":""},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides","title":"<code>SolvatedPeptides</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\" and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). 
Structures are saved every 10 steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import SolvatedPeptides\ndataset = SolvatedPeptides()\n</code></pre></p> References <p>https://doi.org/10.1021/acs.jctc.9b00181</p> <p>https://zenodo.org/records/2605372</p> Source code in <code>openqdc/datasets/potential/solvated_peptides.py</code> <pre><code>class SolvatedPeptides(BaseDataset):\n    \"\"\"\n    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\"\n    and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are\n    run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10\n    steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SolvatedPeptides\n    dataset = SolvatedPeptides()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605372\n    \"\"\"\n\n    __name__ = \"solvated_peptides\"\n\n    __energy_methods__ = [\n        PotentialMethod.REVPBE_D3_BJ_DEF2_TZVP\n        # \"revpbe-d3(bj)/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    # TO CHECK\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"solvated_peptides.hdf5.gz\": \"https://zenodo.org/record/3585804/files/213.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display 
format\n        \"\"\"\n        return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"solvated_peptides.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"solvated_peptides\", self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/potential/solvated_peptides.py</code> <pre><code>def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n</code></pre>"},{"location":"API/datasets/spice.html","title":"Spice","text":""},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.Spice","title":"<code>Spice</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate 100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and molecular dynamics at 1ps and 100K. 
Forces and energies for conformations are calculated at the wB97M-D3(BJ)/def2-TZVPPD level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import Spice\ndataset = Spice()\n</code></pre></p> References <p>https://arxiv.org/abs/2209.10702</p> <p>https://github.com/openmm/spice-dataset</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>class Spice(BaseDataset):\n    \"\"\"\n    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of\n    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,\n    and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate\n    100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and\n    molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the\n    wB97M-D3(BJ)/def2-TZVPPD level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Spice\n    dataset = Spice()\n    ```\n\n    References:\n        https://arxiv.org/abs/2209.10702\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice\"\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n\n    energy_target_names = [\"dft_total_energy\"]\n\n    force_target_names = [\"dft_total_gradient\"]\n\n    subset_mapping = {\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Dipeptides Single Points Dataset v1.2\": \"Dipeptides\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.0\": 
\"DES370K Dimers\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE Ion Pairs Single Points Dataset v1.1\": \"Ion Pairs\",\n    }\n    __links__ = {\"SPICE-1.1.4.hdf5\": \"https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5\"}\n\n    def convert_forces(self, x):\n        return (-1.0) * super().convert_forces(x)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"SPICE-1.1.4.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        tmp = [read_record(data[mol_name], self) for mol_name in tqdm(data)]  # don't use parallelized here\n\n        return tmp\n</code></pre>"},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceV2","title":"<code>SpiceV2</code>","text":"<p>               Bases: <code>Spice</code></p> <p>SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules. The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and (5) Fixing bad calculations from the Spice dataset. 
The data generation process is the same as the Spice dataset.</p> <p>Usage: <pre><code>from openqdc.datasets import SpiceV2\ndataset = SpiceV2()\n</code></pre></p> References <p>https://github.com/openmm/spice-dataset/releases/tag/2.0.0</p> <p>https://github.com/openmm/spice-dataset</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>class SpiceV2(Spice):\n    \"\"\"\n    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules.\n    The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain\n    silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve\n    sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and\n    (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceV2\n    dataset = SpiceV2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spicev2\"\n\n    subset_mapping = {\n        \"SPICE Dipeptides Single Points Dataset v1.3\": \"Dipeptides\",\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Water Clusters v1.0\": \"Water Clusters\",\n        \"SPICE Solvated PubChem Set 1 v1.0\": \"Solvated PubChem\",\n        \"SPICE Amino Acid Ligand v1.0\": \"Amino Acid Ligand\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single 
Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 7 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 8 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 9 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 10 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.1\": \"DES370K Dimers\",\n        \"SPICE PubChem Boron Silicon v1.0\": \"PubChem Boron Silicon\",\n        \"SPICE Ion Pairs Single Points Dataset v1.2\": \"Ion Pairs\",\n    }\n    __links__ = {\"spice-2.0.0.hdf5\": \"https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"spice-2.0.0.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        # Entry 40132 without positions, skip it\n        # don't use parallelized here\n        tmp = [read_record(data[mol_name], self) for i, mol_name in enumerate(tqdm(data)) if i != 40132]\n\n        return tmp\n</code></pre>"},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceVL2","title":"<code>SpiceVL2</code>","text":"<p>               Bases: <code>SpiceV2</code></p> <p>SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.</p> <p>Usage: <pre><code>from openqdc.datasets import SpiceVL2\ndataset = SpiceVL2()\n</code></pre></p> References <p>https://github.com/openmm/spice-dataset/releases/tag/2.0.0</p> <p>https://github.com/openmm/spice-dataset</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>class SpiceVL2(SpiceV2):\n    \"\"\"\n    SpiceVL2 is an extension of the SpiceV2 dataset with additional 
semi-empirical GFN2-xTB and PM6 energy methods.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceVL2\n    dataset = SpiceVL2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice_vl2\"\n\n    __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]\n    energy_target_names = SpiceV2.energy_target_names + [\"GFN2,\" \"PM6\"]\n    __force_mask__ = SpiceV2.__force_mask__ + [False, False]\n</code></pre>"},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.read_record","title":"<code>read_record(r, obj)</code>","text":"<p>Read record from hdf5 file.     r : hdf5 record     obj : Spice class object used to grab subset and names</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>def read_record(r, obj):\n    \"\"\"\n    Read record from hdf5 file.\n        r : hdf5 record\n        obj : Spice class object used to grab subset and names\n    \"\"\"\n    smiles = r[\"smiles\"].asstr()[0]\n    subset = r[\"subset\"][0].decode(\"utf-8\")\n    n_confs = r[\"conformations\"].shape[0]\n    x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))\n    positions = r[\"conformations\"][:]\n\n    res = dict(\n        name=np.array([smiles] * n_confs),\n        subset=np.array([obj.subset_mapping[subset]] * n_confs),\n        energies=r[obj.energy_target_names[0]][:][:, None].astype(np.float64),\n        forces=r[obj.force_target_names[0]][:].reshape(\n            -1, 3, 1\n        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method\n        atomic_inputs=np.concatenate(\n            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32\n        ).reshape(-1, 5),\n        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),\n    )\n\n    return 
res\n</code></pre>"},{"location":"API/datasets/splinter.html","title":"Splinter","text":""},{"location":"API/datasets/splinter.html#openqdc.datasets.interaction.splinter.Splinter","title":"<code>Splinter</code>","text":"<p>               Bases: <code>BaseInteractionDataset</code></p> <p>Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated by quantum mechanical optimization at the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies and the various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.</p> <p>Usage: <pre><code>from openqdc.datasets import Splinter\ndataset = Splinter()\n</code></pre></p> Reference <p>https://doi.org/10.1038/s41597-023-02443-1</p> Source code in <code>openqdc/datasets/interaction/splinter.py</code> <pre><code>class Splinter(BaseInteractionDataset):\n    \"\"\"\n    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated\n    by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. 
The interaction energies\n    and the various components are computed using SAPT0/qug-cc-pV(D=d)Z method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Splinter\n    dataset = Splinter()\n    ```\n\n    Reference:\n        https://doi.org/10.1038/s41597-023-02443-1\n    \"\"\"\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __name__ = \"splinter\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        # \"sapt0/jun-cc-pV(D+d)Z_unscaled\", #TODO: we need to pick the unscaled version only here\n        # \"sapt0/jun-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_unscaled\",\n   
     # \"sapt0/aug-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_scaled\",\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = []\n    __links__ = {\n        \"dimerpairs.0.tar.gz\": \"https://figshare.com/ndownloader/files/39449167\",\n        \"dimerpairs.1.tar.gz\": \"https://figshare.com/ndownloader/files/40271983\",\n        \"dimerpairs.2.tar.gz\": \"https://figshare.com/ndownloader/files/40271989\",\n        \"dimerpairs.3.tar.gz\": \"https://figshare.com/ndownloader/files/40272001\",\n        \"dimerpairs.4.tar.gz\": \"https://figshare.com/ndownloader/files/40272022\",\n        \"dimerpairs.5.tar.gz\": \"https://figshare.com/ndownloader/files/40552931\",\n        \"dimerpairs.6.tar.gz\": \"https://figshare.com/ndownloader/files/40272040\",\n        \"dimerpairs.7.tar.gz\": \"https://figshare.com/ndownloader/files/40272052\",\n        \"dimerpairs.8.tar.gz\": \"https://figshare.com/ndownloader/files/40272061\",\n        \"dimerpairs.9.tar.gz\": 
\"https://figshare.com/ndownloader/files/40272064\",\n        \"dimerpairs_nonstandard.tar.gz\": \"https://figshare.com/ndownloader/files/40272067\",\n        \"lig_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272070\",\n        \"lig_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272073\",\n        \"prot_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272076\",\n        \"prot_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272079\",\n        \"merge_monomers.py\": \"https://figshare.com/ndownloader/files/41807682\",\n    }\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        logger.info(f\"Reading Splinter interaction data from {self.root}\")\n        data = []\n        i = 0\n        with tqdm(total=1680022) as progress_bar:\n            for root, dirs, files in os.walk(self.root):  # total is currently an approximation\n                for filename in files:\n                    if not filename.endswith(\".xyz\"):\n                        continue\n                    i += 1\n                    filepath = os.path.join(root, filename)\n                    filein = open(filepath, \"r\")\n                    lines = list(map(lambda x: x.strip(), filein.readlines()))\n                    n_atoms = np.array([int(lines[0])], dtype=np.int32)\n                    metadata = lines[1].split(\",\")\n                    try:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            r,\n                            theta_P,\n                            tau_P,\n                            theta_L,\n                            tau_L,\n                            tau_PL,\n                        ) = metadata[0].split(\"_\")\n                        
index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(\n                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])\n                        )\n                    except ValueError:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            _,\n                        ) = metadata[0].split(\"_\")\n                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6\n                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)\n                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)\n                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))\n                    lines = list(map(lambda x: x.split(), lines[2:]))\n                    pos = np.array(lines)[:, 1:].astype(np.float32)\n                    elems = np.array(lines)[:, 0]\n                    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)\n                    natoms0 = n_atoms_ptr[0]\n                    natoms1 = n_atoms[0] - natoms0\n                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)\n                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)\n                    subset = np.array([root.split(\"/\")[-1]])\n\n                    item = dict(\n                        energies=energies,\n                        subset=subset,\n                        n_atoms=n_atoms,\n                        n_atoms_ptr=n_atoms_ptr,\n                        atomic_inputs=atomic_inputs,\n                        protein_monomer_name=np.array([protein_monomer_name]),\n                        
protein_interaction_site_type=np.array([protein_interaction_site_type]),\n                        ligand_monomer_name=np.array([ligand_monomer_name]),\n                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),\n                        index=np.array([index], dtype=np.float32),\n                        r=np.array([r], dtype=np.float32),\n                        theta_P=np.array([theta_P], dtype=np.float32),\n                        tau_P=np.array([tau_P], dtype=np.float32),\n                        theta_L=np.array([theta_L], dtype=np.float32),\n                        tau_L=np.array([tau_L], dtype=np.float32),\n                        tau_PL=np.array([tau_PL], dtype=np.float32),\n                        name=np.array([protein_monomer_name + \".\" + ligand_monomer_name]),\n                    )\n                    data.append(item)\n                    progress_bar.update(1)\n        logger.info(f\"Processed {i} files in total\")\n        return data\n</code></pre>"},{"location":"API/datasets/tmqm.html","title":"TMQM","text":""},{"location":"API/datasets/tmqm.html#openqdc.datasets.potential.tmqm.TMQM","title":"<code>TMQM</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated at the TPSSh-D3BJ/def2-SV DFT level of theory. 
Structures are first extracted from Cambridge Structure Database and then optimized in gas phase with the extended tight-binding GFN2-xTB method.</p> <p>Usage: <pre><code>from openqdc.datasets import TMQM\ndataset = TMQM()\n</code></pre></p> References <p>https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041</p> <p>https://github.com/bbskjelstad/tmqm</p> Source code in <code>openqdc/datasets/potential/tmqm.py</code> <pre><code>class TMQM(BaseDataset):\n    \"\"\"\n    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of\n    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated\n    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database\n    and then optimized in gas phase with the extended tight-binding GFN2-xTB method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import TMQM\n    dataset = TMQM()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\\n\n        https://github.com/bbskjelstad/tmqm\n    \"\"\"\n\n    __name__ = \"tmqm\"\n\n    __energy_methods__ = [PotentialMethod.TPSSH_DEF2_TZVP]  # \"tpssh/def2-tzvp\"]\n\n    energy_target_names = [\"TPSSh/def2TZVP level\"]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        x: f\"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}\"\n        for x in [\"tmQM_X1.xyz.gz\", \"tmQM_X2.xyz.gz\", \"tmQM_y.csv\", \"Benchmark2_TPSSh_Opt.xyz\"]\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"tmQM_y.csv\"), sep=\";\", usecols=[\"CSD_code\", \"Electronic_E\"])\n        e_map = dict(zip(df[\"CSD_code\"], df[\"Electronic_E\"]))\n        raw_fnames = [\"tmQM_X1.xyz\", \"tmQM_X2.xyz\", \"Benchmark2_TPSSh_Opt.xyz\"]\n        samples = []\n        for fname in raw_fnames:\n         
   data = read_xyz(p_join(self.root, fname), e_map)\n            samples += data\n\n        return samples\n</code></pre>"},{"location":"API/datasets/transition1x.html","title":"Transition1X","text":""},{"location":"API/datasets/transition1x.html#openqdc.datasets.potential.transition1x.Transition1X","title":"<code>Transition1X</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and the transition states are generated by running Nudged Elastic Band (NEB) with DFT.</p> <p>Usage: <pre><code>from openqdc.datasets import Transition1X\ndataset = Transition1X()\n</code></pre></p> <p>References:</p> <ul> <li>https://www.nature.com/articles/s41597-022-01870-w</li> <li>https://gitlab.com/matschreiner/Transition1x</li> </ul> Source code in <code>openqdc/datasets/potential/transition1x.py</code> <pre><code>class Transition1X(BaseDataset):\n    \"\"\"\n    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy\n    and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. 
The geometries and\n    the transition states are generated by running Nudged Elastic Band (NEB) with DFT.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Transition1X\n    dataset = Transition1X()\n    ```\n\n    References:\n    - https://www.nature.com/articles/s41597-022-01870-w\\n\n    - https://gitlab.com/matschreiner/Transition1x\\n\n    \"\"\"\n\n    __name__ = \"transition1x\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D\n        # \"wb97x/6-31G(d)\",\n    ]\n\n    energy_target_names = [\n        \"wB97x_6-31G(d).energy\",\n    ]\n\n    __force_mask__ = [True]\n    force_target_names = [\n        \"wB97x_6-31G(d).forces\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"Transition1x.h5\": \"https://figshare.com/ndownloader/files/36035789\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"Transition1x.h5\")\n        f = load_hdf5_file(raw_path)[\"data\"]\n\n        res = sum([read_record(f[g], group=g) for g in tqdm(f)], [])  # don't use parallelized here\n        return res\n</code></pre>"},{"location":"API/datasets/vqm24.html","title":"VQM24","text":""},{"location":"API/datasets/vqm24.html#openqdc.datasets.potential.vqm24.VQM24","title":"<code>VQM24</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and relaxed with DFT method wB97x-D3/cc-pVDZ. 
The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.</p> <p>Usage: <pre><code>from openqdc.datasets import VQM24\ndataset = VQM24()\n</code></pre></p> Reference <p>https://arxiv.org/abs/2405.05961</p> Source code in <code>openqdc/datasets/potential/vqm24.py</code> <pre><code>class VQM24(BaseDataset):\n    \"\"\"\n    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical\n    properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional\n    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and\n    relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.\n\n    Usage:\n    ```python\n    from openqdc.datasets import VQM24\n    dataset = VQM24()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2405.05961\n    \"\"\"\n\n    __name__ = \"vqm24\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_CC_PVDZ,  # \"wB97x-D3/cc-pVDZ.\"\n    ]\n\n    energy_target_names = [\n        \"wB97x-D3/cc-pVDZ\",\n    ]\n    # \u03c9B97X-D3/cc-pVDZ\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        f\"{name}.npz\": f\"https://zenodo.org/records/11164951/files/{name}.npz?download=1\"\n        for name in [\"DFT_all\", \"DFT_saddles\", \"DFT_uniques\", \"DMC\"]\n    }\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.append(read_npz_entry(raw_path))\n        return samples\n</code></pre>"},{"location":"API/datasets/waterclusters.html","title":"SCAN Waterclusters","text":""},{"location":"API/datasets/waterclusters.html#openqdc.datasets.potential.waterclusters.SCANWaterClusters","title":"<code>SCANWaterClusters</code>","text":"<p>               Bases: 
<code>BaseDataset</code></p> <p>The SCAN Water Clusters dataset contains conformations of neutral water clusters containing up to 20 monomers, charged water clusters, and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212. Water clusters were obtained from 10-nanosecond gas-phase molecular dynamics simulations using AMBER 9 and then optimized; the lowest-energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.</p> Chemical Species <p>[H, O, Li, Na, K, F, Cl, Br]</p> <p>Usage: <pre><code>from openqdc.datasets import SCANWaterClusters\ndataset = SCANWaterClusters()\n</code></pre></p> References <p>https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec</p> <p>https://github.com/esoteric-ephemera/water_cluster_density_errors</p> Source code in <code>openqdc/datasets/potential/waterclusters.py</code> <pre><code>class SCANWaterClusters(BaseDataset):\n    \"\"\"\n    The SCAN Water Clusters dataset contains conformations of\n    neutral water clusters containing up to 20 monomers, charged water clusters,\n    and alkali- and halide-water clusters. 
This dataset consists of our data sets of water clusters:\n    the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14\n    neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of\n    ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212.\n    Water clusters were obtained from  10 nanosecond gas-phase molecular dynamics\n    simulations using AMBER 9 and optimized to obtain\n    lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.\n\n\n    Chemical Species:\n        [H, O, Li, Na, K, F, Cl, Br]\n\n    Usage:\n    ```python\n    from openqdc.datasets import SCANWaterClusters\n    dataset = SCANWaterClusters()\n    ```\n\n    References:\n        https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\\n\n        https://github.com/esoteric-ephemera/water_cluster_density_errors\n    \"\"\"\n\n    __name__ = \"scanwaterclusters\"\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    energy_target_names = [\n        \"HF\",\n        \"HF-r2SCAN-DC4\",\n        \"SCAN\",\n        \"SCAN@HF\",\n        \"SCAN@r2SCAN50\",\n        \"r2SCAN\",\n        \"r2SCAN@HF\",\n        \"r2SCAN@r2SCAN50\",\n        \"r2SCAN50\",\n        \"r2SCAN100\",\n        \"r2SCAN10\",\n        \"r2SCAN20\",\n        \"r2SCAN25\",\n        \"r2SCAN30\",\n        \"r2SCAN40\",\n        \"r2SCAN60\",\n        \"r2SCAN70\",\n        \"r2SCAN80\",\n        \"r2SCAN90\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]\n    force_target_names = []\n    # 27            # 9 level\n    subsets = [\"BEGDB_H2O\", \"WATER27\", \"H2O_alkali_clusters\", \"H2O_halide_clusters\"]\n    __links__ = {\n        \"geometries.json.gz\": 
\"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True\",  # noqa\n        \"total_energies.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True\",  # noqa\n    }\n\n    def read_raw_entries(self):\n        entries = []  # noqa\n        for i, subset in enumerate(self.subsets):\n            geometries = read_geometries(p_join(self.root, \"geometries.json.gz\"), subset)\n            energies = read_energies(p_join(self.root, \"total_energies.json.gz\"), subset)\n            datum = {}\n            for k in energies:\n                _ = energies[k].pop(\"metadata\")\n                datum[k] = energies[k][\"total_energies\"]\n            entries.extend(format_geometry_and_entries(geometries, datum, subset))\n        return entries\n</code></pre>"},{"location":"API/datasets/waterclusters3_30.html","title":"Waterclusters3_30","text":""},{"location":"API/datasets/waterclusters3_30.html#openqdc.datasets.potential.waterclusters3_30.WaterClusters","title":"<code>WaterClusters</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The WaterClusters dataset contains putative minima and low energy networks for water clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. It contains approximately 4.5 mil. structures. 
Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.</p> Chemical Species <p>[\"H\", \"O\"]</p> <p>Usage: <pre><code>from openqdc.datasets import WaterClusters\ndataset = WaterClusters()\n</code></pre></p> References <p>https://doi.org/10.1063/1.5128378</p> <p>https://sites.uw.edu/wdbase/database-of-water-clusters/</p> Source code in <code>openqdc/datasets/potential/waterclusters3_30.py</code> <pre><code>class WaterClusters(BaseDataset):\n    \"\"\"\n    The WaterClusters dataset contains putative minima and low energy networks for water\n    clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with\n    the TTM2.1-F ab-initio based interaction potential for water.\n    It contains approximately 4.5 mil. structures.\n    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.\n\n    Chemical Species:\n        [\"H\", \"O\"]\n\n    Usage:\n    ```python\n    from openqdc.datasets import WaterClusters\n    dataset = WaterClusters()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5128378\\n\n        https://sites.uw.edu/wdbase/database-of-water-clusters/\\n\n    \"\"\"\n\n    __name__ = \"waterclusters3_30\"\n\n    # Energy in hartree, all zeros by default\n    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [PotentialMethod.TTM2_1_F]  # \"ttm2.1-f\"\n    energy_target_names = [\"TTM2.1-F Potential\"]\n    __links__ = {\"W3-W30_all_geoms_TTM2.1-F.zip\": \"https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n\"}\n\n    def read_raw_entries(self):\n        samples = []\n        parent_folder = p_join(self.root, \"W3-W30_all_geoms_TTM2.1-F/\")\n        for i in range(3, 31):\n            name = f\"W{i}_geoms_all\"\n            zip_path = p_join(parent_folder, f\"{name}.zip\")\n            xyz_path = 
p_join(parent_folder, f\"{name}.xyz\")\n            with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n                zip_ref.extractall(parent_folder)\n\n            data = read_xyz(xyz_path, i)\n            samples += data\n\n        return samples\n</code></pre>"},{"location":"API/datasets/x40.html","title":"X40","text":""},{"location":"API/datasets/x40.html#openqdc.datasets.interaction.x40.X40","title":"<code>X40</code>","text":"<p>               Bases: <code>YamlDataset</code></p> <p>X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules where the halogens participate in various interaction types such as electrostatic interactions, london dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are computed with CCSD(T)/CBS level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import X40\ndataset = X40()\n</code></pre></p> Reference <p>https://pubs.acs.org/doi/10.1021/ct300647k</p> Source code in <code>openqdc/datasets/interaction/x40.py</code> <pre><code>class X40(YamlDataset):\n    \"\"\"\n    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules\n    where the halogens participate in various interaction types such as electrostatic interactions, london\n    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic\n    molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. 
The geometries\n    are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are\n    computed with CCSD(T)/CBS level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import X40\n    dataset = X40()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct300647k\n    \"\"\"\n\n    __name__ = \"x40\"\n    __energy_methods__ = [\n        InteractionMethod.CCSD_T_CBS,  # \"CCSD(T)/CBS\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.DCCSDT_HA_DZ,  # \"dCCSD(T)/haDZ\",\n        InteractionMethod.DCCSDT_HA_TZ,  # \"dCCSD(T)/haTZ\",\n        InteractionMethod.MP2_5_CBS_ADZ,  # \"MP2.5/CBS(aDZ)\",\n    ]\n    __links__ = {\n        \"x40.yaml\": \"http://cuby4.molecular.cz/download_datasets/x40.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/X40.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.shortname\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        xyz_path = p_join(root, f\"{filename}.xyz\")\n        with open(xyz_path, \"r\") as xyz_file:  # avoid not closing the file\n            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))\n            setup = lines.pop(1)\n            n_atoms_first = setup[0].split(\"-\")[1]\n            n_atoms_ptr = np.array([int(n_atoms_first)], dtype=np.int32)\n            return n_atoms_ptr\n</code></pre>"},{"location":"tutorials/usage.html","title":"OpenQDC Hands-on Tutorial","text":"In\u00a0[31]: Copied! 
<pre>from openqdc.datasets import Spice\nds = Spice(\n    energy_unit=\"kcal/mol\",\n    distance_unit=\"ang\",\n)\n</pre> from openqdc.datasets import Spice ds = Spice(     energy_unit=\"kcal/mol\",     distance_unit=\"ang\", )  <pre>2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:381 - Reading preprocessed data.\n2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:382 - Dataset spice with the following units:\n                     Energy: hartree,\n                     Distance: bohr,\n                     Forces: hartree/bohr\n2024-02-29 12:17:13.978 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded atomic_inputs with shape (33175288, 5), dtype float32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded position_idx_range with shape (1110165, 2), dtype int32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded energies with shape (1110165, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded forces with shape (33175288, 3, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded name with shape (1110165,), dtype &lt;U632\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded subset with shape (1110165,), dtype &lt;U20\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded n_atoms with shape (1110165,), dtype int32\n2024-02-29 12:17:13.983 | INFO     | openqdc.datasets.base:_precompute_statistics:154 - Loaded precomputed statistics\n2024-02-29 12:17:13.985 | INFO     | openqdc.datasets.base:_convert_data:141 - Converting spice data to the following units:\n                     Energy: kcal/mol,\n                     Distance: ang,\n                     Forces: kcal/mol/ang\n</pre> In\u00a0[39]: Copied! 
<pre>ds[0]\n</pre> ds[0] Out[39]: <pre>{'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n        [ 0.06135919,  2.6528177 , -0.4163168 ],\n        [ 1.762424  ,  1.0939031 , -1.4321265 ],\n        [-0.22598556,  1.6802124 ,  0.5978407 ],\n        [ 1.1740401 , -0.04154727, -0.512898  ],\n        [-0.41957757, -0.24454471,  3.0900123 ],\n        [ 0.7238282 ,  0.52511275,  0.8248042 ],\n        [ 0.05533566, -0.6713925 ,  1.6488242 ],\n        [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n        [-0.0657557 ,  1.8550861 , -2.3939755 ],\n        [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n        [-0.8098082 ,  3.201651  , -0.6507186 ],\n        [ 0.792407  ,  3.368585  ,  0.01799216],\n        [ 2.558414  ,  1.5826052 , -0.9704587 ],\n        [ 2.166226  ,  0.64460325, -2.384977  ],\n        [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n        [-1.1792994 ,  1.1978384 ,  0.34465855],\n        [ 1.8563557 , -0.90775317, -0.5115611 ],\n        [ 0.31435642, -0.42179283, -1.0628686 ],\n        [ 0.42152542,  0.25200853,  3.627957  ],\n        [-0.5416419 , -1.1152233 ,  3.7040234 ],\n        [-1.1868238 ,  0.46580845,  3.0541756 ],\n        [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n        [-0.7720179 , -0.9603249 ,  0.994841  ],\n        [ 1.7518724 , -1.5571898 ,  2.560223  ],\n        [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n        [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32),\n 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n        1, 1, 1, 1, 1], dtype=int32),\n 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0, 0, 0, 0], dtype=int32),\n 'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [ 
 -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 
19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}</pre> In\u00a0[40]: Copied! <pre>ds.get_ase_atoms(0)\n</pre> ds.get_ase_atoms(0) Out[40]: <pre>Atoms(symbols='C8NH18', pbc=False, initial_charges=...)</pre> In\u00a0[53]: Copied! 
<pre>ds.get_ase_atoms(0).info\n</pre> ds.get_ase_atoms(0).info Out[53]: <pre>{'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n 
        [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}</pre> In\u00a0[41]: Copied! 
<pre>for i in ds.as_iter():\n    print(i)\n    break\n</pre> for i in ds.as_iter():     print(i)     break <pre>{'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n       [ 0.06135919,  2.6528177 , -0.4163168 ],\n       [ 1.762424  ,  1.0939031 , -1.4321265 ],\n       [-0.22598556,  1.6802124 ,  0.5978407 ],\n       [ 1.1740401 , -0.04154727, -0.512898  ],\n       [-0.41957757, -0.24454471,  3.0900123 ],\n       [ 0.7238282 ,  0.52511275,  0.8248042 ],\n       [ 0.05533566, -0.6713925 ,  1.6488242 ],\n       [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n       [-0.0657557 ,  1.8550861 , -2.3939755 ],\n       [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n       [-0.8098082 ,  3.201651  , -0.6507186 ],\n       [ 0.792407  ,  3.368585  ,  0.01799216],\n       [ 2.558414  ,  1.5826052 , -0.9704587 ],\n       [ 2.166226  ,  0.64460325, -2.384977  ],\n       [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n       [-1.1792994 ,  1.1978384 ,  0.34465855],\n       [ 1.8563557 , -0.90775317, -0.5115611 ],\n       [ 0.31435642, -0.42179283, -1.0628686 ],\n       [ 0.42152542,  0.25200853,  3.627957  ],\n       [-0.5416419 , -1.1152233 ,  3.7040234 ],\n       [-1.1868238 ,  0.46580845,  3.0541756 ],\n       [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n       [-0.7720179 , -0.9603249 ,  0.994841  ],\n       [ 1.7518724 , -1.5571898 ,  2.560223  ],\n       [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n       [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32), 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1], dtype=int32), 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0], dtype=int32), 'e0': array([[-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-33939.41501837],\n       [  -312.9767089 ],\n       
[  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ]]), 'energies': array([-232450.64], dtype=float32), 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]', 'subset': 'PubChem', 'forces': array([[[  2.1335483 ],\n        [-37.241825  ],\n        [ 22.830988  ]],\n\n       [[ 68.235725  ],\n        [ 59.30573   ],\n        [-27.672606  ]],\n\n       [[-34.137283  ],\n        [-30.504696  ],\n        [-33.670048  ]],\n\n       [[-49.57814   ],\n        [-75.2747    ],\n        [ 32.80194   ]],\n\n       [[  8.196513  ],\n        [ 17.132149  ],\n        [-36.84995   ]],\n\n       [[ 67.39872   ],\n        [ -8.923976  ],\n        [-20.772083  ]],\n\n       [[ 45.424217  ],\n        [-33.559574  ],\n        [ 20.30243   ]],\n\n       [[-13.522426  ],\n        [ 79.690094  ],\n        [ 15.531546  ]],\n\n       [[ 35.77895   ],\n        [  1.9324436 ],\n        [ -8.205132  ]],\n\n       [[ -3.3487453 ],\n        [ -7.991125  ],\n        [ -9.71156   ]],\n\n       [[  1.4049193 ],\n        [ 13.497365  ],\n        [ -5.981079  ]],\n\n       [[-21.196207  ],\n        [ 16.861713  ],\n        [ -1.7730864 ]],\n\n       [[-10.805695  ],\n        [ -2.033095  ],\n        [ -4.2524548 ]],\n\n       [[ 35.204765  ],\n        [ 12.971134  ],\n        [ 22.815577  ]],\n\n       [[-11.87403   ],\n        [ 10.404548  ],\n        [ 23.009806  ]],\n\n       [[  2.3782759 ],\n        [ 19.309696  ],\n        [ 15.546526  ]],\n\n       [[ 
-2.5732849 ],\n        [ -4.098344  ],\n        [ -5.087256  ]],\n\n       [[  3.5987573 ],\n        [ 10.469024  ],\n        [  9.869113  ]],\n\n       [[ -8.646548  ],\n        [ -0.35554707],\n        [  1.7650104 ]],\n\n       [[ -6.6712875 ],\n        [ -0.7742697 ],\n        [-15.672442  ]],\n\n       [[-25.453985  ],\n        [ -9.350726  ],\n        [  6.0056353 ]],\n\n       [[-32.657543  ],\n        [ 10.617167  ],\n        [  2.516469  ]],\n\n       [[-23.541552  ],\n        [ -9.305013  ],\n        [ -9.855984  ]],\n\n       [[  2.8105662 ],\n        [-13.78966   ],\n        [ 10.141727  ]],\n\n       [[-29.951014  ],\n        [ -9.25683   ],\n        [-23.69946   ]],\n\n       [[ -3.412568  ],\n        [  4.13157   ],\n        [ 12.421117  ]],\n\n       [[  4.77353   ],\n        [-13.841051  ],\n        [  7.6428723 ]]], dtype=float32)}\n</pre> In\u00a0[42]: Copied! <pre>for i in ds.as_iter(atoms=True):\n    print(i)\n    break\n</pre> for i in ds.as_iter(atoms=True):     print(i)     break <pre>Atoms(symbols='C8NH18', pbc=False, initial_charges=...)\n</pre> In\u00a0[43]: Copied! 
<pre>from openqdc.methods import QmMethod\n\n# Get the b3lyp/6-31g* method\nmethod = QmMethod.B3LYP_6_31G_D\nmethod.atom_energies_dict\n</pre> from openqdc.methods import QmMethod  # Get the b3lyp/6-31g* method method = QmMethod.B3LYP_6_31G_D method.atom_energies_dict Out[43]: <pre>{('H', -1): -0.4618190740256503,\n ('H', 0): -0.5002733301377901,\n ('H', 1): 0.0,\n ('Li', 1): -7.284546111273075,\n ('B', -3): -23.577268753399462,\n ('B', -1): -24.614577395156598,\n ('B', 0): -24.65435524492553,\n ('B', 3): -22.018169862974275,\n ('C', -1): -37.844269871879376,\n ('C', 0): -37.84628033285479,\n ('C', 1): -37.42731164237431,\n ('N', -1): -54.52864356359092,\n ('N', 0): -54.584488815424095,\n ('N', 1): -54.0458621835885,\n ('O', -1): -75.05272792994404,\n ('O', 0): -75.06062109946738,\n ('O', 1): -74.54659271939704,\n ('F', -1): -99.75408410035712,\n ('F', 0): -99.71553471526475,\n ('Na', 1): -162.081235395777,\n ('Mg', 2): -199.22734695613283,\n ('Si', 4): -285.5564410277949,\n ('Si', 0): -289.3717359984153,\n ('Si', -4): -288.02795351148654,\n ('P', 0): -341.2580911838578,\n ('P', 1): -340.8765976669208,\n ('S', -1): -398.16568433994024,\n ('S', 0): -398.1049932797066,\n ('S', 1): -397.7199808615457,\n ('Cl', -2): -459.5066184980746,\n ('Cl', -1): -460.25223446009306,\n ('Cl', 0): -460.13624346967765,\n ('Cl', 2): -458.6740467177361,\n ('K', 1): -599.7247062673807,\n ('Ca', 2): -676.8667395990246,\n ('Br', -1): -2573.824201570383,\n ('Br', 0): -2573.705283744811,\n ('I', -1): None,\n ('I', 0): None}</pre> In\u00a0[44]: Copied! 
<pre># Get the matrix of atomization energies for the b3lyp/6-31g* method\nmethod.atom_energies_matrix\n</pre> # Get the matrix of atomization energies for the b3lyp/6-31g* method method.atom_energies_matrix Out[44]: <pre>array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])</pre> In\u00a0[45]: Copied! <pre>import matplotlib.pyplot as plt \nfrom sklearn.decomposition import PCA\ndatum = ds.soap_descriptors(n_samples=500, progress=True)\nreducer = PCA()\nembedding = reducer.fit_transform(datum[\"soap\"])\n</pre> import matplotlib.pyplot as plt  from sklearn.decomposition import PCA datum = ds.soap_descriptors(n_samples=500, progress=True) reducer = PCA() embedding = reducer.fit_transform(datum[\"soap\"])   <pre>100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u
2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 500/500 [00:01&lt;00:00, 459.21it/s]\n</pre> In\u00a0[46]: Copied! <pre>plt.scatter(\n    embedding[:, 0],\n    embedding[:, 1],\n    c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\nplt.colorbar()\n</pre> plt.scatter(     embedding[:, 0],     embedding[:, 1],     c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]]) plt.colorbar()  Out[46]: <pre>&lt;matplotlib.colorbar.Colorbar at 0x1554aa7bd820&gt;</pre>"},{"location":"tutorials/usage.html#openqdc-hands-on-tutorial","title":"OpenQDC Hands-on Tutorial\u00b6","text":""},{"location":"tutorials/usage.html#instantiate-and-go","title":"Instantiate and GO!\u00b6","text":"<p>If you don't have the dataset downloaded, it will be downloaded automatically and cached. You just instantiate the class and you are ready to go. 
Change of units is done automatically upon loading based on the units of the dataset.</p> <p>Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]</p> <p>Supported distance units: [\"ang\", \"nm\", \"bohr\"]</p>"},{"location":"tutorials/usage.html#items-from-the-dataset-object-class-are-obtained-through-the-get-method","title":"Items from the dataset object class are obtained through the \"get\" method.\u00b6","text":"<p>The dictionary of the item contains different important keys:</p> <ul> <li>'positions' : numpy array of the 3d atomic positions (n x 3)</li> <li>'atomic_numbers': numpy array of the atomic numbers (n)</li> <li>'charges': numpy array of the formal charges for the molecule (n)</li> <li>'e0': isolated atom energy of the atoms in the molecule (n x n_level_of_theories)</li> <li>'energies': potential energy of the molecule (n_level_of_theries)</li> <li>'name': name or smiles (is present) of the molecule</li> <li>'subset': subset of the dataset the molecule belongs to</li> <li>'forces': if present, the forces on the atoms (n x 3 x n_level_of_theories_forces)</li> </ul>"},{"location":"tutorials/usage.html#alternatively-we-can-also-retrieve-the-data-from-the-dataset-object-class-as-aseatoms-using-the-get_ase_atoms","title":"Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!\u00b6","text":""},{"location":"tutorials/usage.html#iterators","title":"Iterators\u00b6","text":"<p>The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns the data as an ase.Atoms objects. Otherwise, it returns the dictionary of the item.</p>"},{"location":"tutorials/usage.html#isolated-atoms-energies-e0s","title":"Isolated atoms energies [e0s]\u00b6","text":"<p>The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.</p> <p>$U(A_1, A_2, ...) 
= \\sum_{i_1}^N e_0(A_i) + e(A_1, A_2, ...)$</p> <p>The isolated atoms energies are automatically associated with the correct level of theory, and you can get access as follow</p>"},{"location":"tutorials/usage.html#chemical-space-from-soap-descriptors","title":"Chemical space from SOAP descriptors\u00b6","text":"<p>openQDC offer a simple way to calculate the Smooth Overlaps of Atomic Positions (SOAP) descriptors for the molecules in the dataset. The method get_soap_descriptors returns the SOAP descriptors for the molecules in the dataset.</p>"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"<p>OpenQDC is a python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets and provide a way to standardize the data for easy use in machine learning models.</p> <ul> <li>\ud83d\udc0d Simple pythonic API</li> <li>\ud83d\udd79\ufe0f  ML-Ready: all you manipulate are <code>torch.Tensor</code>,<code>jax.Array</code> or <code>numpy.Array</code>objects.</li> <li>\u269b\ufe0f Quantum Ready: The quantum methods are checked and standardized to provide addictional values.</li> <li>\u2705 Standardized: The datasets are written in standard and performant formats with annotated metadata like units and labels.</li> <li>\ud83e\udde0 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).</li> <li>\ud83d\udcc8 Data: have access to 1.5+ billion datapoints</li> </ul> <p>Visit our website at https://openqdc.io .</p>"},{"location":"index.html#installation","title":"Installation","text":"<p>Use mamba:</p> <pre><code>conda install -c conda-forge openqdc\n</code></pre> <p>Tips: You can replace <code>conda</code> by <code>mamba</code>.</p> <p>Note: We highly recommend using a Conda Python distribution to install OpenQDC. 
The package is also pip installable if you need it: <code>pip install openqdc</code>.</p>"},{"location":"index.html#quick-api-tour","title":"Quick API Tour","text":"<pre><code>from openqdc as Spice\n\n# Load the original dataset\ndataset = Spice()\n\n# Load the dataset with a different units\ndataset = Spice(\n    energy_unit = \"kcal/mol\",\n    distance_unit = \"ang\",\n    energy_type = \"formation\",\n    array_format = \"torch\"\n)\n\n# Access the data\ndata = dataset[0]\n\n# Get relevant statistics\ndataset.get_statistics()\n\n# Get dataset metadata\ndataset.average_n_atoms\ndataset.chemical_species\ndataset.charges\n\n# Compute physical descriptors\ndataset.calculate_descriptors(\n    descriptor_name = \"soap\"\n)\n</code></pre>"},{"location":"index.html#how-to-cite","title":"How to cite","text":"<p>Please cite OpenQDC if you use it in your research: .</p>"},{"location":"index.html#compatibilities","title":"Compatibilities","text":"<p>OpenQDC is compatible with Python &gt;= 3.8 and is tested on Linux, MacOS and Windows.</p>"},{"location":"cli.html","title":"CLI for dataset downloading and uploading","text":"<p>You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).</p>"},{"location":"cli.html#datasets","title":"Datasets","text":"<p>Print a formatted table of the available openQDC datasets and some informations.</p> <p>Usage:</p> <pre><code>openqdc datasets [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n</code></pre>"},{"location":"cli.html#cache","title":"Cache","text":"<p>Get the current local cache path of openQDC</p> <p>Usage:</p> <pre><code>openqdc cache [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n</code></pre>"},{"location":"cli.html#download","title":"Download","text":"<p>Download preprocessed ml-ready datasets from the main openQDC hub.</p> <p>Usage:</p> <pre><code>openqdc download DATASETS... 
[OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n--as-zarr       Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]\n--gs            Whether source to use for downloading. If True, Google Storage will be used.Otherwise, AWS S3 will be used [default: no-gs]\n</code></pre> <p>Example:</p> <pre><code>openqdc download Spice\n</code></pre>"},{"location":"cli.html#fetch","title":"Fetch","text":"<p>Download the raw datasets files from the main openQDC hub</p> <p>Note:</p> <pre><code>Special case: if the dataset is \"all\", \"potential\", \"interaction\".\n</code></pre> <p>Usage:</p> <pre><code>openqdc fetch DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n</code></pre> <p>Example:</p> <pre><code>openqdc fetch Spice\n</code></pre>"},{"location":"cli.html#preprocess","title":"Preprocess","text":"<p>Preprocess a raw dataset (previously fetched) into a openqdc dataset and optionally push it to remote.</p> <p>Usage:</p> <pre><code>openqdc preprocess DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help         Show this message and exit.\n--overwrite    Whether to overwrite the current cached datasets. [default: overwrite]\n--upload       Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]\n--as-zarr      Whether to preprocess as a zarr format or a memmap format. 
[default: no-as-zarr]\n</code></pre> <p>Example:</p> <pre><code>openqdc preprocess Spice QMugs\n</code></pre>"},{"location":"cli.html#upload","title":"Upload","text":"<p>Upload a preprocessed dataset to the remote storage</p> <p>Usage:</p> <pre><code>openqdc upload DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to overwrite the remote files if they are present. [default: overwrite]\n--as-zarr       Whether to upload the zarr files if available. [default: no-as-zarr]\n</code></pre> <p>Example:</p> <pre><code>openqdc upload Spice --overwrite\n</code></pre>"},{"location":"cli.html#convert","title":"Convert","text":"<p>Convert a preprocessed dataset from a memmap dataset to a zarr dataset.</p> <p>Usage:</p> <pre><code>openqdc convert DATASETS... [OPTIONS]\n</code></pre> <p>Options:</p> <pre><code>--help          Show this message and exit.\n--overwrite     Whether to overwrite the current zarr cached datasets. [default: no-overwrite]\n--download      Whether to force the re-download of the memmap datasets. 
[default: no-download]\n</code></pre>"},{"location":"contribute.html","title":"Contribute","text":"<p>This page documents the development lifecycle of OpenQDC.</p>"},{"location":"contribute.html#setup-a-dev-environment","title":"Setup a dev environment","text":"<pre><code>mamba env create -n openqdc -f env.yml\nmamba activate openqdc\npip install -e .\n</code></pre>"},{"location":"contribute.html#pre-commit-installation","title":"Pre commit installation","text":"<pre><code>pre-commit install\npre-commit run --all-files\n</code></pre>"},{"location":"contribute.html#continuous-integration","title":"Continuous Integration","text":"<p>OpenQDC uses GitHub Actions to:</p> <ul> <li>Build and test <code>openQDC</code>.<ul> <li>Multiple combinations of OS and Python versions are tested.</li> </ul> </li> <li>Check the code:<ul> <li>Formatting with <code>black</code>.</li> <li>Static type check with <code>mypy</code>.</li> <li>Modules import formatting with <code>isort</code>.</li> <li>Pre-commit hooks.</li> </ul> </li> <li>Documentation:<ul> <li>Google docstring format.</li> <li>Build and deploy the documentation on <code>main</code> and for every new git tag.</li> </ul> </li> </ul>"},{"location":"contribute.html#run-tests","title":"Run tests","text":"<pre><code>pytest\n</code></pre>"},{"location":"contribute.html#build-the-documentation","title":"Build the documentation","text":"<p>You can build and serve the documentation locally with:</p> <pre><code># Build and serve the doc\nmike serve\n</code></pre> <p>or with</p> <pre><code>mkdocs serve\n</code></pre>"},{"location":"contribute.html#multi-versionning","title":"Multi-versioning","text":"<p>The doc is built for each push on <code>main</code> and every git tag using mike. Everything is automated using GitHub Actions. 
Please refer to the official mike's documentation for the details.</p>"},{"location":"data_storage.html","title":"Data structure","text":""},{"location":"data_storage.html#dataset-structure","title":"Dataset structure","text":"<p>For a dataset with N geometries, M atoms across all geometries, ne energy labels, and nf force labels, we use zarr or memory-mapped arrays of various sizes:</p> <ul> <li>(M, 5) for atomic numbers (1), charges (1), and positions (3) of individual geometries;</li> </ul> <ul> <li>(N, 2) for the beginning and end indices of each geometry in the previous array;</li> </ul> <ul> <li>(N, ne) for the energy labels of each geometry, extendable to store other geometry-level QM properties such as HOMO-LUMO gap;</li> </ul> <ul> <li>(M, nf , 3) for the force labels of each geometry, extendable to store other atom-level QM properties.</li> </ul> <p>The memory-mapped files efficiently access data stored on disk or in the cloud without reading them into memory, enabling training on machines with smaller RAM than the dataset size and accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, batching and iteration.</p> <p></p>"},{"location":"data_storage.html#formats","title":"Formats","text":"<p>We currently support the following formats:</p> <p>1) Zarr : https://zarr.readthedocs.io/en/stable/index.html</p> <p>2) Memmap : https://numpy.org/doc/stable/index.html</p>"},{"location":"dataset_upload.html","title":"How to Add a Dataset to OpenQDC","text":"<p>Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? 
If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:</p> <ol> <li>Opening a PR to add a new dataset</li> <li>Requesting a new dataset through the Google Form</li> </ol>"},{"location":"dataset_upload.html#openqdc-pr-guidelines","title":"OpenQDC PR Guidelines","text":"<p>Implement your dataset in the OpenQDC repository by following the guidelines below:</p>"},{"location":"dataset_upload.html#dataset-class","title":"Dataset class","text":"<ul> <li>The dataset class should be implemented in the <code>openqdc/datasets</code> directory.</li> <li>The dataset class should inherit from the <code>openqdc.datasets.base.BaseDataset</code> class.</li> <li>Add your <code>dataset.py</code> file to the <code>openqdc/datasets/potential</code> or <code>openqdc/datasets/interaction/</code> directory based on the type of energy.</li> <li>Implement the following for your dataset:<ul> <li>Add the metadata of the dataset:<ul> <li>Docstrings for the dataset class. Docstrings should report links and references to the dataset, a small description and, if possible, the sampling strategy used to generate the dataset.</li> <li><code>__links__</code>: Dictionary of name and link to download the dataset.</li> <li><code>__name__</code>: Name of the dataset. This will create a folder with the name of the dataset in the cache directory.</li> <li>The original units for the dataset <code>__energy_unit__</code> and <code>__distance_unit__</code>.</li> <li><code>__force_mask__</code>: Boolean to indicate if the dataset has forces, or a list of booleans if multiple forces are present.</li> <li><code>__energy_methods__</code>: List of the <code>QmMethod</code> methods present in the dataset.</li> </ul> </li> <li><code>read_raw_entries(self)</code> -&gt; <code>List[Dict[str, Any]]</code>: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format, look at data storage. 
This data should have the following keys:<ul> <li><code>atomic_inputs</code> : Atomic inputs of the molecule. numpy.Float32.</li> <li><code>name</code>: Atomic numbers of the atoms in the molecule. numpy.Object.</li> <li><code>subset</code>: Positions of the atoms in the molecule.  numpy.Object.</li> <li><code>energies</code>: Energies of the molecule. numpy.Float64.</li> <li><code>n_atoms</code>: Number of atoms in the molecule. numpy.Int32</li> <li><code>forces</code>: Forces of the molecule. [Optional] numpy.Float32.</li> </ul> </li> <li>Add the dataset import to the <code>openqdc/datasets/&lt;type_of_dataset&gt;/__init__.py</code> file and to <code>openqdc/__init__.py</code>.</li> </ul> </li> </ul>"},{"location":"dataset_upload.html#test-the-dataset","title":"Test the dataset","text":"<p>Try to run the openQDC CLI pipeline with the dataset you implemented.</p> <p>Run the following command to download the dataset:</p> <ul> <li>Fetch the dataset files <pre><code>openqdc fetch DATASET_NAME\n</code></pre></li> </ul> <ul> <li>Preprocess the dataset <pre><code>openqdc preprocess DATASET_NAME\n</code></pre></li> </ul> <ul> <li>Load it on python and check if the dataset is correctly loaded. <pre><code>from openqdc import DATASET_NAME\nds=DATASET_NAME()\n</code></pre></li> </ul> <p>If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.</p> <ul> <li>Select for your PR the <code>dataset</code> label.</li> </ul> <p>Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.</p>"},{"location":"dataset_upload.html#openqdc-google-form","title":"OpenQDC Google Form","text":"<p>Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you. 
You can fill out the Google Form here</p> <p>As the openQDC team will strive to provide a high quality curation and upload, please be patient as the team will need to review the dataset and carry out the necessary steps to ensure the dataset is uploaded correctly.</p>"},{"location":"datasets.html","title":"Overview of Datasets","text":"<p>We provide support for the following publicly available QM Datasets.</p> Dataset # Molecules # Conformers Average Conformers per Molecule Force Labels Atom Types QM Level of Theory Off-Equilibrium Conformations GEOM 450,000 37,000,000 82 No 18 GFN2-xTB No Molecule3D 3,899,647 3,899,647 1 No 5 B3LYP/6-31G* No NablaDFT 1,000,000 5,000,000 5 No 6 \u03c9B97X-D/def2-SVP QMugs 665,000 2,000,000 3 No 10 GFN2-xTB, \u03c9B97X-D/def2-SVP No Spice 19,238 1,132,808 59 Yes 15 \u03c9B97M-D3(BJ)/def2-TZVPPD Yes ANI 57,462 20,000,000 348 No 4 \u03c9B97x:6-31G(d) Yes tmQM 86,665 No TPSSh-D3BJ/def2-SVP DES370K 3,700 370,000 100 No 20 CCSD(T) Yes DES5M 3,700 5,000,000 1351 No 20 SNS-MP2 Yes OrbNet Denali 212,905 2,300,000 11 No 16 GFN1-xTB Yes SN2RXN 39 452709 11,600 Yes 6 DSD-BLYP-D3(BJ)/def2-TZVP QM7X 6,950 4,195,237 603 Yes 7 PBE0+MBD Yes"},{"location":"licensing.html","title":"License","text":"<pre><code>Creative Commons Attribution-NonCommercial 4.0 International\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. 
Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright and\ncertain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n-   Considerations for licensors: Our public licenses are intended for\n    use by those authorized to give the public permission to use\n    material in ways otherwise restricted by copyright and certain other\n    rights. Our licenses are irrevocable. Licensors should read and\n    understand the terms and conditions of the license they choose\n    before applying it. Licensors should also secure all rights\n    necessary before applying our licenses so that the public can reuse\n    the material as expected. Licensors should clearly mark any material\n    not subject to the license. This includes other CC-licensed\n    material, or material used under an exception or limitation to\n    copyright. More considerations for licensors :\n    wiki.creativecommons.org/Considerations\\_for\\_licensors\n\n-   Considerations for the public: By using one of our public licenses,\n    a licensor grants the public permission to use the licensed material\n    under specified terms and conditions. If the licensor's permission\n    is not necessary for any reason\u2013for example, because of any\n    applicable exception or limitation to copyright\u2013then that use is not\n    regulated by the license. Our licenses grant only permissions under\n    copyright and certain other rights that a licensor has authority to\n    grant. 
Use of the licensed material may still be restricted for\n    other reasons, including because others have copyright or other\n    rights in the material. A licensor may make special requests, such\n    as asking that all changes be marked or described. Although not\n    required by our licenses, you are encouraged to respect those\n    requests where reasonable. More considerations for the public :\n    wiki.creativecommons.org/Considerations\\_for\\_licensees\n\nCreative Commons Attribution-NonCommercial 4.0 International Public\nLicense\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial 4.0 International Public License (\"Public\nLicense\"). To the extent this Public License may be interpreted as a\ncontract, You are granted the Licensed Rights in consideration of Your\nacceptance of these terms and conditions, and the Licensor grants You\nsuch rights in consideration of benefits the Licensor receives from\nmaking the Licensed Material available under these terms and conditions.\n\n-   Section 1 \u2013 Definitions.\n\n    -   a. Adapted Material means material subject to Copyright and\n        Similar Rights that is derived from or based upon the Licensed\n        Material and in which the Licensed Material is translated,\n        altered, arranged, transformed, or otherwise modified in a\n        manner requiring permission under the Copyright and Similar\n        Rights held by the Licensor. For purposes of this Public\n        License, where the Licensed Material is a musical work,\n        performance, or sound recording, Adapted Material is always\n        produced where the Licensed Material is synched in timed\n        relation with a moving image.\n    -   b. 
Adapter's License means the license You apply to Your\n        Copyright and Similar Rights in Your contributions to Adapted\n        Material in accordance with the terms and conditions of this\n        Public License.\n    -   c. Copyright and Similar Rights means copyright and/or similar\n        rights closely related to copyright including, without\n        limitation, performance, broadcast, sound recording, and Sui\n        Generis Database Rights, without regard to how the rights are\n        labeled or categorized. For purposes of this Public License, the\n        rights specified in Section 2(b)(1)-(2) are not Copyright and\n        Similar Rights.\n    -   d. Effective Technological Measures means those measures that,\n        in the absence of proper authority, may not be circumvented\n        under laws fulfilling obligations under Article 11 of the WIPO\n        Copyright Treaty adopted on December 20, 1996, and/or similar\n        international agreements.\n    -   e. Exceptions and Limitations means fair use, fair dealing,\n        and/or any other exception or limitation to Copyright and\n        Similar Rights that applies to Your use of the Licensed\n        Material.\n    -   f. Licensed Material means the artistic or literary work,\n        database, or other material to which the Licensor applied this\n        Public License.\n    -   g. Licensed Rights means the rights granted to You subject to\n        the terms and conditions of this Public License, which are\n        limited to all Copyright and Similar Rights that apply to Your\n        use of the Licensed Material and that the Licensor has authority\n        to license.\n    -   h. Licensor means the individual(s) or entity(ies) granting\n        rights under this Public License.\n    -   i. NonCommercial means not primarily intended for or directed\n        towards commercial advantage or monetary compensation. 
For\n        purposes of this Public License, the exchange of the Licensed\n        Material for other material subject to Copyright and Similar\n        Rights by digital file-sharing or similar means is NonCommercial\n        provided there is no payment of monetary compensation in\n        connection with the exchange.\n    -   j. Share means to provide material to the public by any means or\n        process that requires permission under the Licensed Rights, such\n        as reproduction, public display, public performance,\n        distribution, dissemination, communication, or importation, and\n        to make material available to the public including in ways that\n        members of the public may access the material from a place and\n        at a time individually chosen by them.\n    -   k. Sui Generis Database Rights means rights other than copyright\n        resulting from Directive 96/9/EC of the European Parliament and\n        of the Council of 11 March 1996 on the legal protection of\n        databases, as amended and/or succeeded, as well as other\n        essentially equivalent rights anywhere in the world.\n    -   l. You means the individual or entity exercising the Licensed\n        Rights under this Public License. Your has a corresponding\n        meaning.\n\n-   Section 2 \u2013 Scope.\n\n    -   a. License grant.\n        -   1. Subject to the terms and conditions of this Public\n            License, the Licensor hereby grants You a worldwide,\n            royalty-free, non-sublicensable, non-exclusive, irrevocable\n            license to exercise the Licensed Rights in the Licensed\n            Material to:\n            -   A. reproduce and Share the Licensed Material, in whole\n                or in part, for NonCommercial purposes only; and\n            -   B. produce, reproduce, and Share Adapted Material for\n                NonCommercial purposes only.\n        -   2. Exceptions and Limitations. 
For the avoidance of doubt,\n            where Exceptions and Limitations apply to Your use, this\n            Public License does not apply, and You do not need to comply\n            with its terms and conditions.\n        -   3. Term. The term of this Public License is specified in\n            Section 6(a).\n        -   4. Media and formats; technical modifications allowed. The\n            Licensor authorizes You to exercise the Licensed Rights in\n            all media and formats whether now known or hereafter\n            created, and to make technical modifications necessary to do\n            so. The Licensor waives and/or agrees not to assert any\n            right or authority to forbid You from making technical\n            modifications necessary to exercise the Licensed Rights,\n            including technical modifications necessary to circumvent\n            Effective Technological Measures. For purposes of this\n            Public License, simply making modifications authorized by\n            this Section 2(a)(4) never produces Adapted Material.\n        -   5. Downstream recipients.\n            -   A. Offer from the Licensor \u2013 Licensed Material. Every\n                recipient of the Licensed Material automatically\n                receives an offer from the Licensor to exercise the\n                Licensed Rights under the terms and conditions of this\n                Public License.\n            -   B. No downstream restrictions. You may not offer or\n                impose any additional or different terms or conditions\n                on, or apply any Effective Technological Measures to,\n                the Licensed Material if doing so restricts exercise of\n                the Licensed Rights by any recipient of the Licensed\n                Material.\n        -   6. No endorsement. 
Nothing in this Public License\n            constitutes or may be construed as permission to assert or\n            imply that You are, or that Your use of the Licensed\n            Material is, connected with, or sponsored, endorsed, or\n            granted official status by, the Licensor or others\n            designated to receive attribution as provided in Section\n            3(a)(1)(A)(i).\n    -   b. Other rights.\n        -   1. Moral rights, such as the right of integrity, are not\n            licensed under this Public License, nor are publicity,\n            privacy, and/or other similar personality rights; however,\n            to the extent possible, the Licensor waives and/or agrees\n            not to assert any such rights held by the Licensor to the\n            limited extent necessary to allow You to exercise the\n            Licensed Rights, but not otherwise.\n        -   2. Patent and trademark rights are not licensed under this\n            Public License.\n        -   3. To the extent possible, the Licensor waives any right to\n            collect royalties from You for the exercise of the Licensed\n            Rights, whether directly or through a collecting society\n            under any voluntary or waivable statutory or compulsory\n            licensing scheme. In all other cases the Licensor expressly\n            reserves any right to collect such royalties, including when\n            the Licensed Material is used other than for NonCommercial\n            purposes.\n\n-   Section 3 \u2013 License Conditions.\n\n    Your exercise of the Licensed Rights is expressly made subject to\n    the following conditions.\n\n    -   a. Attribution.\n        -   1. If You Share the Licensed Material (including in modified\n            form), You must:\n            -   A. retain the following if it is supplied by the\n                Licensor with the Licensed Material:\n                -   i. 
identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if designated);\n                -   ii. a copyright notice;\n                -   iii. a notice that refers to this Public License;\n                -   iv. a notice that refers to the disclaimer of\n                    warranties;\n                -   v. a URI or hyperlink to the Licensed Material to\n                    the extent reasonably practicable;\n            -   B. indicate if You modified the Licensed Material and\n                retain an indication of any previous modifications; and\n            -   C. indicate the Licensed Material is licensed under this\n                Public License, and include the text of, or the URI or\n                hyperlink to, this Public License.\n        -   2. You may satisfy the conditions in Section 3(a)(1) in any\n            reasonable manner based on the medium, means, and context in\n            which You Share the Licensed Material. For example, it may\n            be reasonable to satisfy the conditions by providing a URI\n            or hyperlink to a resource that includes the required\n            information.\n        -   3. If requested by the Licensor, You must remove any of the\n            information required by Section 3(a)(1)(A) to the extent\n            reasonably practicable.\n        -   4. If You Share Adapted Material You produce, the Adapter's\n            License You apply must not prevent recipients of the Adapted\n            Material from complying with this Public License.\n\n-   Section 4 \u2013 Sui Generis Database Rights.\n\n    Where the Licensed Rights include Sui Generis Database Rights that\n    apply to Your use of the Licensed Material:\n\n    -   a. 
for the avoidance of doubt, Section 2(a)(1) grants You the\n        right to extract, reuse, reproduce, and Share all or a\n        substantial portion of the contents of the database for\n        NonCommercial purposes only;\n    -   b. if You include all or a substantial portion of the database\n        contents in a database in which You have Sui Generis Database\n        Rights, then the database in which You have Sui Generis Database\n        Rights (but not its individual contents) is Adapted Material;\n        and\n    -   c. You must comply with the conditions in Section 3(a) if You\n        Share all or a substantial portion of the contents of the\n        database.\n\n    For the avoidance of doubt, this Section 4 supplements and does not\n    replace Your obligations under this Public License where the\n    Licensed Rights include other Copyright and Similar Rights.\n\n-   Section 5 \u2013 Disclaimer of Warranties and Limitation of Liability.\n\n    -   a. Unless otherwise separately undertaken by the Licensor, to\n        the extent possible, the Licensor offers the Licensed Material\n        as-is and as-available, and makes no representations or\n        warranties of any kind concerning the Licensed Material, whether\n        express, implied, statutory, or other. This includes, without\n        limitation, warranties of title, merchantability, fitness for a\n        particular purpose, non-infringement, absence of latent or other\n        defects, accuracy, or the presence or absence of errors, whether\n        or not known or discoverable. Where disclaimers of warranties\n        are not allowed in full or in part, this disclaimer may not\n        apply to You.\n    -   b. 
To the extent possible, in no event will the Licensor be\n        liable to You on any legal theory (including, without\n        limitation, negligence) or otherwise for any direct, special,\n        indirect, incidental, consequential, punitive, exemplary, or\n        other losses, costs, expenses, or damages arising out of this\n        Public License or use of the Licensed Material, even if the\n        Licensor has been advised of the possibility of such losses,\n        costs, expenses, or damages. Where a limitation of liability is\n        not allowed in full or in part, this limitation may not apply to\n        You.\n    -   c. The disclaimer of warranties and limitation of liability\n        provided above shall be interpreted in a manner that, to the\n        extent possible, most closely approximates an absolute\n        disclaimer and waiver of all liability.\n\n-   Section 6 \u2013 Term and Termination.\n\n    -   a. This Public License applies for the term of the Copyright and\n        Similar Rights licensed here. However, if You fail to comply\n        with this Public License, then Your rights under this Public\n        License terminate automatically.\n    -   b. Where Your right to use the Licensed Material has terminated\n        under Section 6(a), it reinstates:\n\n        -   1. automatically as of the date the violation is cured,\n            provided it is cured within 30 days of Your discovery of the\n            violation; or\n        -   2. upon express reinstatement by the Licensor.\n\n        For the avoidance of doubt, this Section 6(b) does not affect\n        any right the Licensor may have to seek remedies for Your\n        violations of this Public License.\n\n    -   c. 
For the avoidance of doubt, the Licensor may also offer the\n        Licensed Material under separate terms or conditions or stop\n        distributing the Licensed Material at any time; however, doing\n        so will not terminate this Public License.\n    -   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n        License.\n\n-   Section 7 \u2013 Other Terms and Conditions.\n\n    -   a. The Licensor shall not be bound by any additional or\n        different terms or conditions communicated by You unless\n        expressly agreed.\n    -   b. Any arrangements, understandings, or agreements regarding the\n        Licensed Material not stated herein are separate from and\n        independent of the terms and conditions of this Public License.\n\n-   Section 8 \u2013 Interpretation.\n\n    -   a. For the avoidance of doubt, this Public License does not, and\n        shall not be interpreted to, reduce, limit, restrict, or impose\n        conditions on any use of the Licensed Material that could\n        lawfully be made without permission under this Public License.\n    -   b. To the extent possible, if any provision of this Public\n        License is deemed unenforceable, it shall be automatically\n        reformed to the minimum extent necessary to make it enforceable.\n        If the provision cannot be reformed, it shall be severed from\n        this Public License without affecting the enforceability of the\n        remaining terms and conditions.\n    -   c. No term or condition of this Public License will be waived\n        and no failure to comply consented to unless expressly agreed to\n        by the Licensor.\n    -   d. Nothing in this Public License constitutes or may be\n        interpreted as a limitation upon, or waiver of, any privileges\n        and immunities that apply to the Licensor or You, including from\n        the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. 
Notwithstanding,\nCreative Commons may elect to apply one of its public licenses to\nmaterial it publishes and in those instances will be considered the\n\"Licensor.\" The text of the Creative Commons public licenses is\ndedicated to the public domain under the CC0 Public Domain Dedication.\nExcept for the limited purpose of indicating that material is shared\nunder a Creative Commons public license or as otherwise permitted by the\nCreative Commons policies published at creativecommons.org/policies,\nCreative Commons does not authorize the use of the trademark \"Creative\nCommons\" or any other trademark or logo of Creative Commons without its\nprior written consent including, without limitation, in connection with\nany unauthorized modifications to any of its public licenses or any\nother arrangements, understandings, or agreements concerning use of\nlicensed material. For the avoidance of doubt, this paragraph does not\nform part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n</code></pre>"},{"location":"normalization_e0s.html","title":"Overview of QM Methods and Normalization","text":"<p>OpenQDC provides support for 250+ QM Methods and provides a way to standardize and categorize the usage of different level of theories used for Quantum Mechanics Single Point Calculations to add value and information to the datasets.</p>"},{"location":"normalization_e0s.html#level-of-theory","title":"Level of Theory","text":"<p>To avoid inconsistencies, level of theories are standardized and categorized into Python Enums consisting of a functional, a basis set, and a correction method. OpenQDC covers more than 106 functionals, 20 basis sets, and 11 correction methods. 
OpenQDC provides the computed isolated atom energies <code>e0</code> for each QM method.</p>"},{"location":"normalization_e0s.html#normalization","title":"Normalization","text":"<p>We support normalizing energies through \"physical\" and \"regression\" normalization to conserve the size extensivity of chemical systems. Through this normalization, OpenQDC provides a way to transform the potential energy into atomization energy by subtracting the isolated atom energies <code>e0</code>, a physically interpretable and extensivity-conserving normalization method. Alternatively, we pre-compute the average contribution of each atom species to potential energy via linear or ridge regression, centering the distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.</p>"},{"location":"normalization_e0s.html#physical-normalization","title":"Physical Normalization","text":"<p><code>e0</code> energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from the potential energy to obtain the atomization energy. This normalization method is physically interpretable and only removes the atom energy contribution from the potential energy.</p>"},{"location":"normalization_e0s.html#regression-normalization","title":"Regression Normalization","text":"<p><code>e0</code> energies are calculated for each atom in the dataset from fitting a regression model to the potential energy. The <code>e0</code> energies are then subtracted from the potential energy to obtain the atomization energy. This normalization provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy. 
The resulting formation energy is centered at 0.</p>"},{"location":"usage.html","title":"Usage","text":""},{"location":"usage.html#how-to-use","title":"How to use","text":"<p>OpenQDC has been designed to be used with a single import:</p> <pre><code>import openqdc as qdc\ndataset = qdc.QM9()\n</code></pre> <p>All <code>openQDC</code> functions are available under <code>qdc</code>. Or if you want to directly import a specific dataset:</p> <pre><code>from openqdc import Spice\n# Spice dataset with distance unit in angstrom instead of bohr\ndataset = Spice(distance_unit=\"ang\",\n                array_format = \"jax\"\n)\ndataset[0] # dict of jax array\n</code></pre> <p>Or if you prefer handling <code>ase.Atoms</code> objects:</p> <pre><code>dataset.get_ase_atoms(0)\n</code></pre>"},{"location":"usage.html#iterators","title":"Iterators","text":"<p>OpenQDC provides a simple way to get the data as iterators:</p> <pre><code>for data in dataset.as_iter(atoms=True):\n    print(data) # Atoms object\n    break\n</code></pre> <p>or if you want to just iterate over the data:</p> <pre><code>for data in dataset:\n    print(data) # dict of arrays\n    break\n</code></pre>"},{"location":"usage.html#lazy-loading","title":"Lazy loading","text":"<p>OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during <code>import openqdc as qdc</code>. 
In case of trouble you can always disable lazy loading by setting the environment variable <code>OPENQDC_DISABLE_LAZY_LOADING</code> to <code>1</code>.</p>"},{"location":"API/basedataset.html","title":"BaseDataset","text":"<p>The BaseDataset defining shared functionality between all datasets.</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset","title":"<code>BaseDataset</code>","text":"<p>               Bases: <code>DatasetPropertyMixIn</code></p> <p>Base class for datasets in the openQDC package.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>class BaseDataset(DatasetPropertyMixIn):\n    \"\"\"\n    Base class for datasets in the openQDC package.\n    \"\"\"\n\n    energy_target_names = []\n    force_target_names = []\n    read_as_zarr = False\n    __energy_methods__ = []\n    __force_mask__ = []\n    __isolated_atom_energies__ = []\n    _fn_energy = lambda x: x\n    _fn_distance = lambda x: x\n    _fn_forces = lambda x: x\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __average_nb_atoms__ = None\n    __links__ = {}\n\n    def __init__(\n        self,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n        array_format: str = \"numpy\",\n        energy_type: Optional[str] = \"formation\",\n        overwrite_local_cache: bool = False,\n        cache_dir: Optional[str] = None,\n        recompute_statistics: bool = False,\n        transform: Optional[Callable] = None,\n        skip_statistics: bool = False,\n        read_as_zarr: bool = False,\n        regressor_kwargs: Dict = {\n            \"solver_type\": \"linear\",\n            \"sub_sample\": None,\n            \"stride\": 1,\n        },\n    ) -&gt; None:\n        \"\"\"\n\n        Parameters:\n            energy_unit:\n                Energy unit to convert dataset to. 
Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n            distance_unit:\n                Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n            array_format:\n                Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n            energy_type:\n                Type of isolated atom energy to use for the dataset. Default: \"formation\"\n                Supported types: [\"formation\", \"regression\", \"null\", None]\n            overwrite_local_cache:\n                Whether to overwrite the locally cached dataset.\n            cache_dir:\n                Cache directory location. Defaults to \"~/.cache/openqdc\"\n            recompute_statistics:\n                Whether to recompute the statistics of the dataset.\n            transform:\n                transformation to apply to the __getitem__ calls\n            regressor_kwargs:\n                Dictionary of keyword arguments to pass to the regressor.\n                Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n                solver_type can be one of [\"linear\", \"ridge\"]\n        \"\"\"\n        set_cache_dir(cache_dir)\n        # self._init_lambda_fn()\n        self.data = None\n        self._original_unit = self.energy_unit\n        self.recompute_statistics = recompute_statistics\n        self.regressor_kwargs = regressor_kwargs\n        self.transform = transform\n        self.read_as_zarr = read_as_zarr\n        self.energy_type = energy_type if energy_type is not None else \"null\"\n        self.refit_e0s = recompute_statistics or overwrite_local_cache\n        self.skip_statistics = skip_statistics\n        if not self.is_preprocessed():\n            raise DatasetNotAvailableError(self.__name__)\n        else:\n            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n        self.set_array_format(array_format)\n        
self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n\n    def _init_lambda_fn(self):\n        self._fn_energy = lambda x: x\n        self._fn_distance = lambda x: x\n        self._fn_forces = lambda x: x\n\n    @property\n    def dataset_wrapper(self):\n        if not hasattr(self, \"_dataset_wrapper\"):\n            self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()\n        return self._dataset_wrapper\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=self.__name__, links=self.__links__)\n\n    @classmethod\n    def fetch(cls, cache_path: Optional[str] = None, overwrite: bool = False) -&gt; None:\n        from openqdc.utils.download_api import DataDownloader\n\n        DataDownloader(cache_path, overwrite).from_config(cls.no_init().config)\n\n    def _post_init(\n        self,\n        overwrite_local_cache: bool = False,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n    ) -&gt; None:\n        self._set_units(None, None)\n        self._set_isolated_atom_energies()\n        if not self.skip_statistics:\n            self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)\n        self._set_units(energy_unit, distance_unit)\n        self._convert_data()\n        self._set_isolated_atom_energies()\n\n    def _precompute_statistics(self, overwrite_local_cache: bool = False):\n        # if self.recompute_statistics or overwrite_local_cache:\n        self.statistics = StatisticManager(\n            self,\n            self.recompute_statistics or overwrite_local_cache,  # check if we need to recompute\n            # Add the common statistics (Forces, TotalE, FormE, PerAtomE)\n            ForcesCalculatorStats,\n            TotalEnergyStats,\n            FormationEnergyStats,\n            PerAtomFormationEnergyStats,\n        )\n        
self.statistics.run_calculators()  # run the calculators\n        self._compute_average_nb_atoms()\n\n    @classmethod\n    def no_init(cls):\n        \"\"\"\n        Class method to avoid the __init__ method to be called when the class is instanciated.\n        Useful for debugging purposes or preprocessing data.\n        \"\"\"\n        return cls.__new__(cls)\n\n    @property\n    def __force_methods__(self):\n        \"\"\"\n        For backward compatibility. To be removed in the future.\n        \"\"\"\n        return self.force_methods\n\n    @property\n    def energy_methods(self) -&gt; List[str]:\n        \"\"\"Return the string version of the energy methods\"\"\"\n        return [str(i) for i in self.__energy_methods__]\n\n    @property\n    def force_mask(self):\n        if len(self.__class__.__force_mask__) == 0:\n            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)\n        return self.__class__.__force_mask__\n\n    @property\n    def force_methods(self):\n        return list(compress(self.energy_methods, self.force_mask))\n\n    @property\n    def e0s_dispatcher(self) -&gt; AtomEnergies:\n        \"\"\"\n        Property to get the object that dispatched the isolated atom energies of the QM methods.\n\n        Returns:\n            Object wrapping the isolated atom energies of the QM methods.\n        \"\"\"\n        if not hasattr(self, \"_e0s_dispatcher\"):\n            # Automatically fetch/compute formation or regression energies\n            self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)\n        return self._e0s_dispatcher\n\n    def _convert_data(self):\n        logger.info(\n            f\"Converting {self.__name__} data to the following units:\\n\\\n                     Energy: {str(self.energy_unit)},\\n\\\n                     Distance: {str(self.distance_unit)},\\n\\\n                     Forces: {str(self.force_unit) if self.__force_methods__ else 'None'}\"\n        )\n        for key in 
self.data_keys:\n            self.data[key] = self._convert_on_loading(self.data[key], key)\n\n    @property\n    def energy_unit(self):\n        return EnergyTypeConversion(self.__energy_unit__)\n\n    @property\n    def distance_unit(self):\n        return DistanceTypeConversion(self.__distance_unit__)\n\n    @property\n    def force_unit(self):\n        units = self.__forces_unit__.split(\"/\")\n        if len(units) &gt; 2:\n            units = [\"/\".join(units[:2]), units[-1]]\n        return ForceTypeConversion(tuple(units))  # &lt; 3.12 compatibility\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), self.__name__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def data_keys(self):\n        keys = list(self.data_types.keys())\n        if len(self.__force_methods__) == 0:\n            keys.remove(\"forces\")\n        return keys\n\n    @property\n    def pkl_data_keys(self):\n        return list(self.pkl_data_types.keys())\n\n    @property\n    def pkl_data_types(self):\n        return {\"name\": str, \"subset\": str, \"n_atoms\": np.int32}\n\n    @property\n    def atom_energies(self):\n        return self._e0s_dispatcher\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float64,\n            \"forces\": np.float32,\n        }\n\n    @property\n    def data_shapes(self):\n        return {\n            \"atomic_inputs\": (-1, NB_ATOMIC_FEATURES),\n            \"position_idx_range\": (-1, 2),\n            \"energies\": (-1, len(self.energy_methods)),\n            \"forces\": (-1, 3, len(self.force_methods)),\n        }\n\n    def _set_units(self, en: Optional[str] = None, ds: Optional[str] = None):\n        old_en, old_ds = self.energy_unit, self.distance_unit\n   
     en = en if en is not None else old_en\n        ds = ds if ds is not None else old_ds\n        self.set_energy_unit(en)\n        self.set_distance_unit(ds)\n        if self.__force_methods__:\n            self._fn_forces = self.force_unit.to(str(self.energy_unit), str(self.distance_unit))\n            self.__forces_unit__ = str(self.energy_unit) + \"/\" + str(self.distance_unit)\n\n    def _set_isolated_atom_energies(self):\n        if self.__energy_methods__ is None:\n            logger.error(\"No energy methods defined for this dataset.\")\n        if self.energy_type == \"formation\":\n            f = get_conversion(\"hartree\", self.__energy_unit__)\n        else:\n            # regression are calculated on the original unit of the dataset\n            f = self._original_unit.to(self.energy_unit)\n        self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix)\n\n    def convert_energy(self, x):\n        return self._fn_energy(x)\n\n    def convert_distance(self, x):\n        return self._fn_distance(x)\n\n    def convert_forces(self, x):\n        return self._fn_forces(x)\n\n    def set_energy_unit(self, value: str):\n        \"\"\"\n        Set a new energy unit for the dataset.\n\n        Parameters:\n            value:\n                New energy unit to set.\n        \"\"\"\n        # old_unit = self.energy_unit\n        # self.__energy_unit__ = value\n        self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n        self.__energy_unit__ = value\n\n    def set_distance_unit(self, value: str):\n        \"\"\"\n        Set a new distance unit for the dataset.\n\n        Parameters:\n            value:\n                New distance unit to set.\n        \"\"\"\n        # old_unit = self.distance_unit\n        # self.__distance_unit__ = value\n        self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n        self.__distance_unit__ = value\n\n    def set_array_format(self, format: 
str):\n        assert format in [\"numpy\", \"torch\", \"jax\"], f\"Format {format} not supported.\"\n        self.array_format = format\n\n    def read_raw_entries(self):\n        \"\"\"\n        Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n        \"\"\"\n        raise NotImplementedError\n\n    def collate_list(self, list_entries: List[Dict]) -&gt; Dict:\n        \"\"\"\n        Collate a list of entries into a single dictionary.\n\n        Parameters:\n            list_entries:\n                List of dictionaries containing the entries to collate.\n\n        Returns:\n            Dictionary containing the collated entries.\n        \"\"\"\n        # concatenate entries\n        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n        csum = np.cumsum(res.get(\"n_atoms\"))\n        x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n        x[1:, 0], x[:, 1] = csum[:-1], csum\n        res[\"position_idx_range\"] = x\n\n        return res\n\n    def save_preprocess(\n        self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n    ):\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n        Parameters:\n            data_dict:\n                Dictionary containing the preprocessed data.\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                Whether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. 
Cache is always overwritten locally.\n        \"\"\"\n        # save memmaps\n        logger.info(\"Preprocessing data and saving it to cache.\")\n        paths = self.dataset_wrapper.save_preprocess(\n            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n        )\n        if upload:\n            for local_path in paths:\n                push_remote(local_path, overwrite=overwrite)  # make it async?\n\n    def read_preprocess(self, overwrite_local_cache=False):\n        logger.info(\"Reading preprocessed data.\")\n        logger.info(\n            f\"Dataset {self.__name__} with the following units:\\n\\\n                     Energy: {self.energy_unit},\\n\\\n                     Distance: {self.distance_unit},\\n\\\n                     Forces: {self.force_unit if self.force_methods else 'None'}\"\n        )\n\n        self.data = self.dataset_wrapper.load_data(\n            self.preprocess_path,\n            self.data_keys,\n            self.data_types,\n            self.data_shapes,\n            self.pkl_data_keys,\n            overwrite_local_cache,\n        )  # this should be async if possible\n        for key in self.data:\n            logger.info(f\"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}\")\n\n    def _convert_on_loading(self, x, key):\n        if key == \"energies\":\n            return self.convert_energy(x)\n        elif key == \"forces\":\n            return self.convert_forces(x)\n        elif key == \"atomic_inputs\":\n            x = np.array(x, dtype=np.float32)\n            x[:, -3:] = self.convert_distance(x[:, -3:])\n            return x\n        else:\n            return x\n\n    def is_preprocessed(self) -&gt; bool:\n        \"\"\"\n        Check if the dataset is preprocessed and available online or locally.\n\n        Returns:\n            True if the dataset is available remotely or locally, False otherwise.\n        \"\"\"\n        predicats = [\n         
   copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def is_cached(self) -&gt; bool:\n        \"\"\"\n        Check if the dataset is cached locally.\n\n        Returns:\n            True if the dataset is cached locally, False otherwise.\n        \"\"\"\n        predicats = [\n            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n        \"\"\"\n        Preprocess the dataset and save it.\n\n        Parameters:\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                hether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n            as_zarr:\n                Whether to save the data as zarr files\n        \"\"\"\n        if overwrite or not self.is_preprocessed():\n            entries = self.read_raw_entries()\n            res = self.collate_list(entries)\n            self.save_preprocess(res, upload, overwrite, as_zarr)\n\n    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n        \"\"\"\n        Upload the preprocessed data to the remote storage. 
Must be called after preprocess and\n        need to have write privileges.\n\n        Parameters:\n            overwrite:\n                Whether to overwrite the remote data if it already exists\n            as_zarr:\n                Whether to upload the data as zarr files\n        \"\"\"\n        for key in self.data_keys:\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n            push_remote(local_path, overwrite=overwrite)\n        local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n        push_remote(local_path, overwrite=overwrite)\n\n    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n        \"\"\"\n        Save a single entry at index idx as an extxyz file.\n\n        Parameters:\n            idx:\n                Index of the entry\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file. 
If None, the current working directory is used.\n            ext:\n                Whether to include additional informations like forces and other metadatas (extxyz format)\n        \"\"\"\n        if path is None:\n            path = os.getcwd()\n        at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n        write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n\n    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n        \"\"\"\n        Save dataset as single xyz file (extended xyz format).\n\n        Parameters:\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file\n        \"\"\"\n        with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n            for atoms in tqdm(\n                self.as_iter(atoms=True, energy_method=energy_method),\n                total=len(self),\n                desc=f\"Saving {self.__name__} as xyz file\",\n            ):\n                write_extxyz(f, atoms, append=True)\n\n    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -&gt; Atoms:\n        \"\"\"\n        Get the ASE atoms object for the entry at index idx.\n\n        Parameters:\n            idx:\n                Index of the entry.\n            energy_method:\n                Index of the energy method to use\n            ext:\n                Whether to include additional informations\n\n        Returns:\n            ASE atoms object\n        \"\"\"\n        entry = self[idx]\n        at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n        return at\n\n    def subsample(\n        self, n_samples: Optional[Union[List[int], int, float]] = None, replace: bool = False, seed: int = 42\n    ):\n        np.random.seed(seed)\n        if n_samples is None:\n            return list(range(len(self)))\n        try:\n            if 0 &lt; n_samples &lt; 1:\n  
              n_samples = int(n_samples * len(self))\n            if isinstance(n_samples, int):\n                idxs = np.random.choice(len(self), size=n_samples, replace=replace)\n        except (ValueError, TypeError):  # list, set, np.ndarray\n            idxs = n_samples\n        return idxs\n\n    @requires_package(\"datamol\")\n    def calculate_descriptors(\n        self,\n        descriptor_name: str = \"soap\",\n        chemical_species: Optional[List[str]] = None,\n        n_samples: Optional[Union[List[int], int, float]] = None,\n        progress: bool = True,\n        **descriptor_kwargs,\n    ) -&gt; Dict[str, np.ndarray]:\n        \"\"\"\n        Compute the descriptors for the dataset.\n\n        Parameters:\n            descriptor_name:\n                Name of the descriptor to use. Supported descriptors are [\"soap\"]\n            chemical_species:\n                List of chemical species to use for the descriptor computation, by default None.\n                If None, the chemical species of the dataset are used.\n            n_samples:\n                Number of samples to use for the computation, by default None.\n                If None, all the dataset is used.\n                If a list of integers is provided, the descriptors are computed for\n                each of the specified idx of samples.\n            progress:\n                Whether to show a progress bar, by default True.\n            **descriptor_kwargs : dict\n                Keyword arguments to pass to the descriptor instantiation of the model.\n\n        Returns:\n            Dictionary containing the following keys:\n                - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n                - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n        \"\"\"\n        import datamol as dm\n\n        datum = {}\n        idxs = self.subsample(n_samples)\n        model = 
get_descriptor(descriptor_name.lower())(\n            species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n        )\n\n        def wrapper(idx):\n            entry = self.get_ase_atoms(idx, ext=False)\n            return model.calculate(entry)\n\n        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n        datum[\"values\"] = np.vstack(descr)\n        datum[\"idxs\"] = idxs\n        return datum\n\n    def as_iter(self, atoms: bool = False, energy_method: int = 0) -&gt; Iterable:\n        \"\"\"\n        Return the dataset as an iterator.\n\n        Parameters:\n            atoms:\n                Whether to return the items as ASE atoms object, by default False\n            energy_method:\n                Index of the energy method to use\n\n        Returns:\n            Iterator of the dataset\n        \"\"\"\n\n        func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n        for i in range(len(self)):\n            yield func(i)\n\n    def __iter__(self):\n        for idxs in range(len(self)):\n            yield self[idxs]\n\n    def get_statistics(self, return_none: bool = True) -&gt; Dict:\n        \"\"\"\n        Get the converted statistics of the dataset.\n\n        Parameters:\n            return_none :\n                Whether to return None if the statistics for the forces are not available, by default True\n                Otherwise, the statistics for the forces are set to 0.0\n\n        Returns:\n            Dictionary containing the statistics of the dataset\n        \"\"\"\n        selected_stats = self.statistics.get_results()\n        if len(selected_stats) == 0:\n            raise StatisticsNotAvailableError(self.__name__)\n        if not return_none:\n            selected_stats.update(\n                {\n                    \"ForcesCalculatorStats\": {\n                        \"mean\": 
np.array([0.0]),\n                        \"std\": np.array([0.0]),\n                        \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                    }\n                }\n            )\n        # cycle trough dict to convert units\n        for key, result in selected_stats.items():\n            if isinstance(result, ForcesCalculatorStats):\n                result.transform(self.convert_forces)\n            else:\n                result.transform(self.convert_energy)\n            result.transform(self._convert_array)\n        return {k: result.to_dict() for k, result in selected_stats.items()}\n\n    def __str__(self):\n        return f\"{self.__name__}\"\n\n    def __repr__(self):\n        return f\"{self.__name__}\"\n\n    def __len__(self):\n        return self.data[\"energies\"].shape[0]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n\n    def _convert_array(self, x: np.ndarray):\n        return _CONVERT_DICT.get(self.array_format)(x)\n\n    def __getitem__(self, idx: int):\n        shift = MAX_CHARGE\n        p_start, p_end = self.data[\"position_idx_range\"][idx]\n        input = self.data[\"atomic_inputs\"][p_start:p_end]\n        z, c, positions, energies = (\n            self._convert_array(np.array(input[:, 0], dtype=np.int32)),\n            self._convert_array(np.array(input[:, 1], dtype=np.int32)),\n            self._convert_array(np.array(input[:, -3:], dtype=np.float32)),\n            self._convert_array(np.array(self.data[\"energies\"][idx], dtype=np.float64)),\n        )\n        name = self.__smiles_converter__(self.data[\"name\"][idx])\n        subset = self.data[\"subset\"][idx]\n        e0s = 
self._convert_array(self.__isolated_atom_energies__[..., z, c + shift].T)\n        formation_energies = energies - e0s.sum(axis=0)\n        forces = None\n        if \"forces\" in self.data:\n            forces = self._convert_array(np.array(self.data[\"forces\"][p_start:p_end], dtype=np.float32))\n\n        bunch = Bunch(\n            positions=positions,\n            atomic_numbers=z,\n            charges=c,\n            e0=e0s,\n            energies=energies,\n            formation_energies=formation_energies,\n            per_atom_formation_energies=formation_energies / len(z),\n            name=name,\n            subset=subset,\n            forces=forces,\n        )\n\n        if self.transform is not None:\n            bunch = self.transform(bunch)\n\n        return bunch\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__force_methods__","title":"<code>__force_methods__</code>  <code>property</code>","text":"<p>For backward compatibility. To be removed in the future.</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.e0s_dispatcher","title":"<code>e0s_dispatcher: AtomEnergies</code>  <code>property</code>","text":"<p>Property to get the object that dispatched the isolated atom energies of the QM methods.</p> <p>Returns:</p> Type Description <code>AtomEnergies</code> <p>Object wrapping the isolated atom energies of the QM methods.</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.energy_methods","title":"<code>energy_methods: List[str]</code>  <code>property</code>","text":"<p>Return the string version of the energy methods</p>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__init__","title":"<code>__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 
'linear', 'sub_sample': None, 'stride': 1})</code>","text":"<p>Parameters:</p> Name Type Description Default <code>energy_unit</code> <code>Optional[str]</code> <p>Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]</p> <code>None</code> <code>distance_unit</code> <code>Optional[str]</code> <p>Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]</p> <code>None</code> <code>array_format</code> <code>str</code> <p>Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]</p> <code>'numpy'</code> <code>energy_type</code> <code>Optional[str]</code> <p>Type of isolated atom energy to use for the dataset. Default: \"formation\" Supported types: [\"formation\", \"regression\", \"null\", None]</p> <code>'formation'</code> <code>overwrite_local_cache</code> <code>bool</code> <p>Whether to overwrite the locally cached dataset.</p> <code>False</code> <code>cache_dir</code> <code>Optional[str]</code> <p>Cache directory location. Defaults to \"~/.cache/openqdc\"</p> <code>None</code> <code>recompute_statistics</code> <code>bool</code> <p>Whether to recompute the statistics of the dataset.</p> <code>False</code> <code>transform</code> <code>Optional[Callable]</code> <p>transformation to apply to the getitem calls</p> <code>None</code> <code>regressor_kwargs</code> <code>Dict</code> <p>Dictionary of keyword arguments to pass to the regressor. 
Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1} solver_type can be one of [\"linear\", \"ridge\"]</p> <code>{'solver_type': 'linear', 'sub_sample': None, 'stride': 1}</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def __init__(\n    self,\n    energy_unit: Optional[str] = None,\n    distance_unit: Optional[str] = None,\n    array_format: str = \"numpy\",\n    energy_type: Optional[str] = \"formation\",\n    overwrite_local_cache: bool = False,\n    cache_dir: Optional[str] = None,\n    recompute_statistics: bool = False,\n    transform: Optional[Callable] = None,\n    skip_statistics: bool = False,\n    read_as_zarr: bool = False,\n    regressor_kwargs: Dict = {\n        \"solver_type\": \"linear\",\n        \"sub_sample\": None,\n        \"stride\": 1,\n    },\n) -&gt; None:\n    \"\"\"\n\n    Parameters:\n        energy_unit:\n            Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n        distance_unit:\n            Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n        array_format:\n            Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n        energy_type:\n            Type of isolated atom energy to use for the dataset. Default: \"formation\"\n            Supported types: [\"formation\", \"regression\", \"null\", None]\n        overwrite_local_cache:\n            Whether to overwrite the locally cached dataset.\n        cache_dir:\n            Cache directory location. 
Defaults to \"~/.cache/openqdc\"\n        recompute_statistics:\n            Whether to recompute the statistics of the dataset.\n        transform:\n            transformation to apply to the __getitem__ calls\n        regressor_kwargs:\n            Dictionary of keyword arguments to pass to the regressor.\n            Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n            solver_type can be one of [\"linear\", \"ridge\"]\n    \"\"\"\n    set_cache_dir(cache_dir)\n    # self._init_lambda_fn()\n    self.data = None\n    self._original_unit = self.energy_unit\n    self.recompute_statistics = recompute_statistics\n    self.regressor_kwargs = regressor_kwargs\n    self.transform = transform\n    self.read_as_zarr = read_as_zarr\n    self.energy_type = energy_type if energy_type is not None else \"null\"\n    self.refit_e0s = recompute_statistics or overwrite_local_cache\n    self.skip_statistics = skip_statistics\n    if not self.is_preprocessed():\n        raise DatasetNotAvailableError(self.__name__)\n    else:\n        self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n    self.set_array_format(array_format)\n    self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.as_iter","title":"<code>as_iter(atoms=False, energy_method=0)</code>","text":"<p>Return the dataset as an 
iterator.</p> <p>Parameters:</p> Name Type Description Default <code>atoms</code> <code>bool</code> <p>Whether to return the items as ASE atoms object, by default False</p> <code>False</code> <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <p>Returns:</p> Type Description <code>Iterable</code> <p>Iterator of the dataset</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def as_iter(self, atoms: bool = False, energy_method: int = 0) -&gt; Iterable:\n    \"\"\"\n    Return the dataset as an iterator.\n\n    Parameters:\n        atoms:\n            Whether to return the items as ASE atoms object, by default False\n        energy_method:\n            Index of the energy method to use\n\n    Returns:\n        Iterator of the dataset\n    \"\"\"\n\n    func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n    for i in range(len(self)):\n        yield func(i)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.calculate_descriptors","title":"<code>calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)</code>","text":"<p>Compute the descriptors for the dataset.</p> <p>Parameters:</p> Name Type Description Default <code>descriptor_name</code> <code>str</code> <p>Name of the descriptor to use. Supported descriptors are [\"soap\"]</p> <code>'soap'</code> <code>chemical_species</code> <code>Optional[List[str]]</code> <p>List of chemical species to use for the descriptor computation, by default None. If None, the chemical species of the dataset are used.</p> <code>None</code> <code>n_samples</code> <code>Optional[Union[List[int], int, float]]</code> <p>Number of samples to use for the computation, by default None. If None, all the dataset is used. 
If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.</p> <code>None</code> <code>progress</code> <code>bool</code> <p>Whether to show a progress bar, by default True.</p> <code>True</code> <code>**descriptor_kwargs</code> <p>dict Keyword arguments to pass to the descriptor instantiation of the model.</p> <code>{}</code> <p>Returns:</p> Type Description <code>Dict[str, ndarray]</code> <p>Dictionary containing the following keys: - values : np.ndarray of shape (N, M) containing the descriptors for the dataset - idxs : np.ndarray of shape (N,) containing the indices of the samples used</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>@requires_package(\"datamol\")\ndef calculate_descriptors(\n    self,\n    descriptor_name: str = \"soap\",\n    chemical_species: Optional[List[str]] = None,\n    n_samples: Optional[Union[List[int], int, float]] = None,\n    progress: bool = True,\n    **descriptor_kwargs,\n) -&gt; Dict[str, np.ndarray]:\n    \"\"\"\n    Compute the descriptors for the dataset.\n\n    Parameters:\n        descriptor_name:\n            Name of the descriptor to use. 
Supported descriptors are [\"soap\"]\n        chemical_species:\n            List of chemical species to use for the descriptor computation, by default None.\n            If None, the chemical species of the dataset are used.\n        n_samples:\n            Number of samples to use for the computation, by default None.\n            If None, all the dataset is used.\n            If a list of integers is provided, the descriptors are computed for\n            each of the specified idx of samples.\n        progress:\n            Whether to show a progress bar, by default True.\n        **descriptor_kwargs : dict\n            Keyword arguments to pass to the descriptor instantiation of the model.\n\n    Returns:\n        Dictionary containing the following keys:\n            - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n            - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n    \"\"\"\n    import datamol as dm\n\n    datum = {}\n    idxs = self.subsample(n_samples)\n    model = get_descriptor(descriptor_name.lower())(\n        species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n    )\n\n    def wrapper(idx):\n        entry = self.get_ase_atoms(idx, ext=False)\n        return model.calculate(entry)\n\n    descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n    datum[\"values\"] = np.vstack(descr)\n    datum[\"idxs\"] = idxs\n    return datum\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.collate_list","title":"<code>collate_list(list_entries)</code>","text":"<p>Collate a list of entries into a single dictionary.</p> <p>Parameters:</p> Name Type Description Default <code>list_entries</code> <code>List[Dict]</code> <p>List of dictionaries containing the entries to collate.</p> required <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary containing the collated 
entries.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def collate_list(self, list_entries: List[Dict]) -&gt; Dict:\n    \"\"\"\n    Collate a list of entries into a single dictionary.\n\n    Parameters:\n        list_entries:\n            List of dictionaries containing the entries to collate.\n\n    Returns:\n        Dictionary containing the collated entries.\n    \"\"\"\n    # concatenate entries\n    res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n    csum = np.cumsum(res.get(\"n_atoms\"))\n    x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n    x[1:, 0], x[:, 1] = csum[:-1], csum\n    res[\"position_idx_range\"] = x\n\n    return res\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_ase_atoms","title":"<code>get_ase_atoms(idx, energy_method=0, ext=True)</code>","text":"<p>Get the ASE atoms object for the entry at index idx.</p> <p>Parameters:</p> Name Type Description Default <code>idx</code> <code>int</code> <p>Index of the entry.</p> required <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <code>ext</code> <code>bool</code> <p>Whether to include additional informations</p> <code>True</code> <p>Returns:</p> Type Description <code>Atoms</code> <p>ASE atoms object</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -&gt; Atoms:\n    \"\"\"\n    Get the ASE atoms object for the entry at index idx.\n\n    Parameters:\n        idx:\n            Index of the entry.\n        energy_method:\n            Index of the energy method to use\n        ext:\n            Whether to include additional informations\n\n    Returns:\n        ASE atoms object\n    \"\"\"\n    entry = self[idx]\n    at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n    return 
at\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_statistics","title":"<code>get_statistics(return_none=True)</code>","text":"<p>Get the converted statistics of the dataset.</p> <p>Parameters:</p> Name Type Description Default <code>return_none</code> <p>Whether to return None if the statistics for the forces are not available, by default True Otherwise, the statistics for the forces are set to 0.0</p> <code>True</code> <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary containing the statistics of the dataset</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def get_statistics(self, return_none: bool = True) -&gt; Dict:\n    \"\"\"\n    Get the converted statistics of the dataset.\n\n    Parameters:\n        return_none :\n            Whether to return None if the statistics for the forces are not available, by default True\n            Otherwise, the statistics for the forces are set to 0.0\n\n    Returns:\n        Dictionary containing the statistics of the dataset\n    \"\"\"\n    selected_stats = self.statistics.get_results()\n    if len(selected_stats) == 0:\n        raise StatisticsNotAvailableError(self.__name__)\n    if not return_none:\n        selected_stats.update(\n            {\n                \"ForcesCalculatorStats\": {\n                    \"mean\": np.array([0.0]),\n                    \"std\": np.array([0.0]),\n                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                }\n            }\n        )\n    # cycle trough dict to convert units\n    for key, result in selected_stats.items():\n        if isinstance(result, ForcesCalculatorStats):\n            result.transform(self.convert_forces)\n        else:\n            result.transform(self.convert_energy)\n        
result.transform(self._convert_array)\n    return {k: result.to_dict() for k, result in selected_stats.items()}\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_cached","title":"<code>is_cached()</code>","text":"<p>Check if the dataset is cached locally.</p> <p>Returns:</p> Type Description <code>bool</code> <p>True if the dataset is cached locally, False otherwise.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def is_cached(self) -&gt; bool:\n    \"\"\"\n    Check if the dataset is cached locally.\n\n    Returns:\n        True if the dataset is cached locally, False otherwise.\n    \"\"\"\n    predicats = [\n        os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_preprocessed","title":"<code>is_preprocessed()</code>","text":"<p>Check if the dataset is preprocessed and available online or locally.</p> <p>Returns:</p> Type Description <code>bool</code> <p>True if the dataset is available remotely or locally, False otherwise.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def is_preprocessed(self) -&gt; bool:\n    \"\"\"\n    Check if the dataset is preprocessed and available online or locally.\n\n    Returns:\n        True if the dataset is available remotely or locally, False otherwise.\n    \"\"\"\n    predicats = [\n        copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return 
all(predicats)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.no_init","title":"<code>no_init()</code>  <code>classmethod</code>","text":"<p>Class method to avoid the init method to be called when the class is instanciated. Useful for debugging purposes or preprocessing data.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>@classmethod\ndef no_init(cls):\n    \"\"\"\n    Class method to avoid the __init__ method to be called when the class is instanciated.\n    Useful for debugging purposes or preprocessing data.\n    \"\"\"\n    return cls.__new__(cls)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.preprocess","title":"<code>preprocess(upload=False, overwrite=True, as_zarr=True)</code>","text":"<p>Preprocess the dataset and save it.</p> <p>Parameters:</p> Name Type Description Default <code>upload</code> <code>bool</code> <p>Whether to upload the preprocessed data to the remote storage or only saving it locally.</p> <code>False</code> <code>overwrite</code> <code>bool</code> <p>hether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.</p> <code>True</code> <code>as_zarr</code> <code>bool</code> <p>Whether to save the data as zarr files</p> <code>True</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n    \"\"\"\n    Preprocess the dataset and save it.\n\n    Parameters:\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            hether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. 
Cache is always overwritten locally.\n        as_zarr:\n            Whether to save the data as zarr files\n    \"\"\"\n    if overwrite or not self.is_preprocessed():\n        entries = self.read_raw_entries()\n        res = self.collate_list(entries)\n        self.save_preprocess(res, upload, overwrite, as_zarr)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.read_raw_entries","title":"<code>read_raw_entries()</code>","text":"<p>Preprocess the raw (aka from the fetched source) into a list of dictionaries.</p> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def read_raw_entries(self):\n    \"\"\"\n    Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n    \"\"\"\n    raise NotImplementedError\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_preprocess","title":"<code>save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)</code>","text":"<p>Save the preprocessed data to the cache directory and optionally upload it to the remote storage.</p> <p>Parameters:</p> Name Type Description Default <code>data_dict</code> <code>Dict[str, ndarray]</code> <p>Dictionary containing the preprocessed data.</p> required <code>upload</code> <code>bool</code> <p>Whether to upload the preprocessed data to the remote storage or only saving it locally.</p> <code>False</code> <code>overwrite</code> <code>bool</code> <p>Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. 
Cache is always overwritten locally.</p> <code>True</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def save_preprocess(\n    self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n):\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n    Parameters:\n        data_dict:\n            Dictionary containing the preprocessed data.\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n    \"\"\"\n    # save memmaps\n    logger.info(\"Preprocessing data and saving it to cache.\")\n    paths = self.dataset_wrapper.save_preprocess(\n        self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n    )\n    if upload:\n        for local_path in paths:\n            push_remote(local_path, overwrite=overwrite)  # make it async?\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_xyz","title":"<code>save_xyz(idx, energy_method=0, path=None, ext=True)</code>","text":"<p>Save a single entry at index idx as an extxyz file.</p> <p>Parameters:</p> Name Type Description Default <code>idx</code> <code>int</code> <p>Index of the entry</p> required <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <code>path</code> <code>Optional[str]</code> <p>Path to save the xyz file. 
If None, the current working directory is used.</p> <code>None</code> <code>ext</code> <code>bool</code> <p>Whether to include additional informations like forces and other metadatas (extxyz format)</p> <code>True</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n    \"\"\"\n    Save a single entry at index idx as an extxyz file.\n\n    Parameters:\n        idx:\n            Index of the entry\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file. If None, the current working directory is used.\n        ext:\n            Whether to include additional informations like forces and other metadatas (extxyz format)\n    \"\"\"\n    if path is None:\n        path = os.getcwd()\n    at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n    write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_distance_unit","title":"<code>set_distance_unit(value)</code>","text":"<p>Set a new distance unit for the dataset.</p> <p>Parameters:</p> Name Type Description Default <code>value</code> <code>str</code> <p>New distance unit to set.</p> required Source code in <code>openqdc/datasets/base.py</code> <pre><code>def set_distance_unit(self, value: str):\n    \"\"\"\n    Set a new distance unit for the dataset.\n\n    Parameters:\n        value:\n            New distance unit to set.\n    \"\"\"\n    # old_unit = self.distance_unit\n    # self.__distance_unit__ = value\n    self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n    self.__distance_unit__ = value\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_energy_unit","title":"<code>set_energy_unit(value)</code>","text":"<p>Set a new energy unit for the 
dataset.</p> <p>Parameters:</p> Name Type Description Default <code>value</code> <code>str</code> <p>New energy unit to set.</p> required Source code in <code>openqdc/datasets/base.py</code> <pre><code>def set_energy_unit(self, value: str):\n    \"\"\"\n    Set a new energy unit for the dataset.\n\n    Parameters:\n        value:\n            New energy unit to set.\n    \"\"\"\n    # old_unit = self.energy_unit\n    # self.__energy_unit__ = value\n    self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n    self.__energy_unit__ = value\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.to_xyz","title":"<code>to_xyz(energy_method=0, path=None)</code>","text":"<p>Save dataset as single xyz file (extended xyz format).</p> <p>Parameters:</p> Name Type Description Default <code>energy_method</code> <code>int</code> <p>Index of the energy method to use</p> <code>0</code> <code>path</code> <code>Optional[str]</code> <p>Path to save the xyz file</p> <code>None</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n    \"\"\"\n    Save dataset as single xyz file (extended xyz format).\n\n    Parameters:\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file\n    \"\"\"\n    with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n        for atoms in tqdm(\n            self.as_iter(atoms=True, energy_method=energy_method),\n            total=len(self),\n            desc=f\"Saving {self.__name__} as xyz file\",\n        ):\n            write_extxyz(f, atoms, append=True)\n</code></pre>"},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.upload","title":"<code>upload(overwrite=False, as_zarr=False)</code>","text":"<p>Upload the preprocessed data to the remote storage. 
Must be called after preprocess and need to have write privileges.</p> <p>Parameters:</p> Name Type Description Default <code>overwrite</code> <code>bool</code> <p>Whether to overwrite the remote data if it already exists</p> <code>False</code> <code>as_zarr</code> <code>bool</code> <p>Whether to upload the data as zarr files</p> <code>False</code> Source code in <code>openqdc/datasets/base.py</code> <pre><code>def upload(self, overwrite: bool = False, as_zarr: bool = False):\n    \"\"\"\n    Upload the preprocessed data to the remote storage. Must be called after preprocess and\n    need to have write privileges.\n\n    Parameters:\n        overwrite:\n            Whether to overwrite the remote data if it already exists\n        as_zarr:\n            Whether to upload the data as zarr files\n    \"\"\"\n    for key in self.data_keys:\n        local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n        push_remote(local_path, overwrite=overwrite)\n    local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n    push_remote(local_path, overwrite=overwrite)\n</code></pre>"},{"location":"API/e0_dispatcher.html","title":"e0 Dispatcher","text":""},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies","title":"<code>AtomEnergies</code>","text":"<p>Manager class for interface with the isolated atom energies classes and providing the generals function to retrieve the data</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class AtomEnergies:\n    \"\"\"\n    Manager class for interface with the isolated atom energies classes\n    and providing the generals function to retrieve the data\n    \"\"\"\n\n    def __init__(self, data, **kwargs) -&gt; None:\n        self.atom_energies = data.energy_type\n        self.factory = dispatch_factory(data, **kwargs)\n\n    @property\n    def e0s_matrix(self) -&gt; np.ndarray:\n        \"\"\"\n        Return the 
isolated atom energies dictionary\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_matrix\n\n    @property\n    def e0s_dict(self) -&gt; Dict[AtomSpecies, AtomEnergy]:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_dict\n\n    def __str__(self):\n        return f\"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}\"\n\n    def __repr__(self):\n        return str(self)\n\n    def __getitem__(self, item: AtomSpecies) -&gt; AtomEnergy:\n        \"\"\"\n        Retrieve a key from the isolated atom dictionary.\n        Item can be written as tuple(Symbol, charge),\n        tuple(Chemical number, charge). If no charge is passed,\n        it will be automatically set to 0.\n\n        Examples:\n            AtomEnergies[6], AtomEnergies[6,1], \\n\n            AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n            AtomEnergies[(\"C,1)]\n\n        Parameters:\n            item:\n                AtomSpecies object or tuple with the atom symbol and charge\n\n        Returns:\n            AtomEnergy object with the isolated atom energy\n        \"\"\"\n        try:\n            atom, charge = item[0], item[1]\n        except TypeError:\n            atom = item\n            charge = 0\n        except IndexError:\n            atom = item[0]\n            charge = 0\n        if not isinstance(atom, str):\n            atom = ATOM_SYMBOLS[atom]\n        return self.e0s_dict[(atom, charge)]\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_dict","title":"<code>e0s_dict: Dict[AtomSpecies, AtomEnergy]</code>  <code>property</code>","text":"<p>Return the isolated atom energies dictionary</p> <p>Returns:</p> Type Description <code>Dict[AtomSpecies, AtomEnergy]</code> <p>Dictionary with the isolated 
atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_matrix","title":"<code>e0s_matrix: np.ndarray</code>  <code>property</code>","text":"<p>Return the isolated atom energies dictionary</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Matrix Array with the isolated atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.__getitem__","title":"<code>__getitem__(item)</code>","text":"<p>Retrieve a key from the isolated atom dictionary. Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). If no charge is passed, it will be automatically set to 0.</p> <p>Examples:</p> <p>AtomEnergies[6], AtomEnergies[6,1], </p> <p>AtomEnergies[\"C\",1], AtomEnergies[(6,1)], </p> <p>AtomEnergies[(\"C,1)]</p> <p>Parameters:</p> Name Type Description Default <code>item</code> <code>AtomSpecies</code> <p>AtomSpecies object or tuple with the atom symbol and charge</p> required <p>Returns:</p> Type Description <code>AtomEnergy</code> <p>AtomEnergy object with the isolated atom energy</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def __getitem__(self, item: AtomSpecies) -&gt; AtomEnergy:\n    \"\"\"\n    Retrieve a key from the isolated atom dictionary.\n    Item can be written as tuple(Symbol, charge),\n    tuple(Chemical number, charge). 
If no charge is passed,\n    it will be automatically set to 0.\n\n    Examples:\n        AtomEnergies[6], AtomEnergies[6,1], \\n\n        AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n        AtomEnergies[(\"C,1)]\n\n    Parameters:\n        item:\n            AtomSpecies object or tuple with the atom symbol and charge\n\n    Returns:\n        AtomEnergy object with the isolated atom energy\n    \"\"\"\n    try:\n        atom, charge = item[0], item[1]\n    except TypeError:\n        atom = item\n        charge = 0\n    except IndexError:\n        atom = item[0]\n        charge = 0\n    if not isinstance(atom, str):\n        atom = ATOM_SYMBOLS[atom]\n    return self.e0s_dict[(atom, charge)]\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy","title":"<code>AtomEnergy</code>  <code>dataclass</code>","text":"<p>Datastructure to store isolated atom energies and the std deviation associated to the value. By default the std will be 1 if no value was calculated or not available (formation energy case)</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>@dataclass\nclass AtomEnergy:\n    \"\"\"\n    Datastructure to store isolated atom energies\n    and the std deviation associated to the value.\n    By default the std will be 1 if no value was calculated\n    or not available (formation energy case)\n    \"\"\"\n\n    mean: np.array\n    std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32))\n\n    def __post_init__(self):\n        if not isinstance(self.mean, np.ndarray):\n            self.mean = np.array([self.mean], dtype=np.float32)\n\n    def append(self, other: \"AtomEnergy\"):\n        \"\"\"\n        Append the mean and std of another atom energy\n        \"\"\"\n        self.mean = np.append(self.mean, other.mean)\n        self.std = np.append(self.std, 
other.std)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy.append","title":"<code>append(other)</code>","text":"<p>Append the mean and std of another atom energy</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def append(self, other: \"AtomEnergy\"):\n    \"\"\"\n    Append the mean and std of another atom energy\n    \"\"\"\n    self.mean = np.append(self.mean, other.mean)\n    self.std = np.append(self.std, other.std)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomSpecies","title":"<code>AtomSpecies</code>  <code>dataclass</code>","text":"<p>Structure that defines a tuple of chemical specie and charge and provide hash and automatic conversion from atom number to checmical symbol</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>@dataclass(frozen=False, eq=True)\nclass AtomSpecies:\n    \"\"\"\n    Structure that defines a tuple of chemical specie and charge\n    and provide hash and automatic conversion from atom number to\n    checmical symbol\n    \"\"\"\n\n    symbol: Union[str, int]\n    charge: int = 0\n\n    def __post_init__(self):\n        if not isinstance(self.symbol, str):\n            self.symbol = ATOM_SYMBOLS[self.symbol]\n        self.number = ATOMIC_NUMBERS[self.symbol]\n\n    def __hash__(self):\n        return hash((self.symbol, self.charge))\n\n    def __eq__(self, other):\n        if not isinstance(other, AtomSpecies):\n            symbol, charge = other[0], other[1]\n            other = AtomSpecies(symbol=symbol, charge=charge)\n        return (self.number, self.charge) == (other.number, other.charge)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface","title":"<code>IsolatedEnergyInterface</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract class that defines the interface for the different implementation of an isolated atom energy value</p> 
Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class IsolatedEnergyInterface(ABC):\n    \"\"\"\n    Abstract class that defines the interface for the\n    different implementation of an isolated atom energy value\n    \"\"\"\n\n    def __init__(self, data, **kwargs):\n        \"\"\"\n        Parameters:\n            data : openqdc.datasets.Dataset\n                Dataset object that contains the information\n                about the isolated atom energies. Info will be passed\n                by references\n            kwargs : dict\n                Additional arguments that will be passed to the\n                selected energy class. Mostly used for regression\n                to pass the regressor_kwargs.\n        \"\"\"\n        self._e0_matrixs = []\n        self._e0_dict = None\n        self.kwargs = kwargs\n        self.data = data\n        self._post_init()\n\n    @property\n    def refit(self) -&gt; bool:\n        return self.data.refit_e0s\n\n    @abstractmethod\n    def _post_init(self):\n        \"\"\"\n        Main method to fetch/compute/recomputed the isolated atom energies.\n        Need to be implemented in all child classes.\n        \"\"\"\n        pass\n\n    def __len__(self):\n        return len(self.data.energy_methods)\n\n    @property\n    def e0_matrix(self) -&gt; np.ndarray:\n        \"\"\"\n        Return the isolated atom energies matrixes\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return np.array(self._e0_matrixs)\n\n    @property\n    def e0_dict(self) -&gt; Dict:\n        \"\"\"\n        Return the isolated atom energies dict\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n\n        return self._e0s_dict\n\n    def __str__(self) -&gt; str:\n        return 
self.__class__.__name__.lower()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_dict","title":"<code>e0_dict: Dict</code>  <code>property</code>","text":"<p>Return the isolated atom energies dict</p> <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary with the isolated atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_matrix","title":"<code>e0_matrix: np.ndarray</code>  <code>property</code>","text":"<p>Return the isolated atom energies matrixes</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Matrix Array with the isolated atom energies</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.__init__","title":"<code>__init__(data, **kwargs)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>data</code> <p>openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by references</p> required <code>kwargs</code> <p>dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.</p> <code>{}</code> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def __init__(self, data, **kwargs):\n    \"\"\"\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. 
Mostly used for regression\n            to pass the regressor_kwargs.\n    \"\"\"\n    self._e0_matrixs = []\n    self._e0_dict = None\n    self.kwargs = kwargs\n    self.data = data\n    self._post_init()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.NullEnergy","title":"<code>NullEnergy</code>","text":"<p>               Bases: <code>IsolatedEnergyInterface</code></p> <p>Class that returns a null (zeros) matrix for the isolated atom energies in case of no energies are available.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class NullEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a null (zeros) matrix for the isolated atom energies in case\n    of no energies are available.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for _ in self.data.__energy_methods__:\n            for key, values in PotentialMethod.NONE.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))]\n        self._assembly_e0_dict()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.PhysicalEnergy","title":"<code>PhysicalEnergy</code>","text":"<p>               Bases: <code>IsolatedEnergyInterface</code></p> <p>Class that returns a physical (SE,DFT,etc) isolated atom energies.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class PhysicalEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a physical (SE,DFT,etc) isolated atom energies.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for method in 
self.data.__energy_methods__:\n            for key, values in method.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__]\n        self._assembly_e0_dict()\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy","title":"<code>RegressionEnergy</code>","text":"<p>               Bases: <code>IsolatedEnergyInterface</code></p> <p>Class that compute and returns the regressed isolated atom energies.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>class RegressionEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that compute and returns the regressed isolated atom energies.\n    \"\"\"\n\n    def _post_init(self):\n        if not self.attempt_load() or self.refit:\n            self.regressor = Regressor.from_openqdc_dataset(self.data, **self.kwargs)\n            E0s, cov = self._compute_regression_e0s()\n            self._set_lin_atom_species_dict(E0s, cov)\n        self._set_linear_e0s()\n\n    def _compute_regression_e0s(self) -&gt; Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Try to compute the regressed isolated atom energies.\n        raise an error if the regression fails.\n        return the regressed isolated atom energies and the uncertainty values.\n\n        Returns:\n            Tuple with the regressed isolated atom energies and the uncertainty values of the regression\n            if available.\n        \"\"\"\n        try:\n            E0s, cov = self.regressor.solve()\n        except np.linalg.LinAlgError:\n            logger.warning(f\"Failed to compute E0s using {self.regressor.solver_type} 
regression.\")\n            raise np.linalg.LinAlgError\n        return E0s, cov\n\n    def _set_lin_atom_species_dict(self, E0s, covs) -&gt; None:\n        \"\"\"\n        Set the regressed isolated atom energies in a dictionary format\n        and Save the values in a pickle file to easy loading.\n        \"\"\"\n        atomic_energies_dict = {}\n        for i, z in enumerate(self.regressor.numbers):\n            for charge in range(-10, 11):\n                atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i])\n            # atomic_energies_dict[z] = E0s[i]\n        self._e0s_dict = atomic_energies_dict\n        self.save_e0s()\n\n    def _set_linear_e0s(self) -&gt; None:\n        \"\"\"\n        Transform the e0s dictionary into the correct e0s\n        matrix format.\n        \"\"\"\n        new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]\n        for z, e0 in self._e0s_dict.items():\n            for i in range(len(self)):\n                # new_e0s[i][z, :] = e0[i]\n                new_e0s[i][z.number, z.charge] = e0.mean[i]\n            # for atom_sp, values in\n        self._e0_matrixs = new_e0s\n\n    def save_e0s(self) -&gt; None:\n        \"\"\"\n        Save the regressed isolated atom energies in a pickle file.\n        \"\"\"\n        save_pkl(self._e0s_dict, self.preprocess_path)\n\n    def attempt_load(self) -&gt; bool:\n        \"\"\"\n        Try to load the regressed isolated atom energies from the\n        object pickle file and return the success of the operation.\n        \"\"\"\n        try:\n            self._e0s_dict = load_pkl(self.preprocess_path)\n            logger.info(f\"Found energy file for {str(self)}.\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Energy file for {str(self)} not found.\")\n            return False\n\n    @property\n    def preprocess_path(self):\n        \"\"\"\n        
Return the path to the object pickle file.\n        \"\"\"\n        path = p_join(self.data.root, \"preprocessed\", str(self) + \".pkl\")\n        return path\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.preprocess_path","title":"<code>preprocess_path</code>  <code>property</code>","text":"<p>Return the path to the object pickle file.</p>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.attempt_load","title":"<code>attempt_load()</code>","text":"<p>Try to load the regressed isolated atom energies from the object pickle file and return the success of the operation.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def attempt_load(self) -&gt; bool:\n    \"\"\"\n    Try to load the regressed isolated atom energies from the\n    object pickle file and return the success of the operation.\n    \"\"\"\n    try:\n        self._e0s_dict = load_pkl(self.preprocess_path)\n        logger.info(f\"Found energy file for {str(self)}.\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Energy file for {str(self)} not found.\")\n        return False\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.save_e0s","title":"<code>save_e0s()</code>","text":"<p>Save the regressed isolated atom energies in a pickle file.</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def save_e0s(self) -&gt; None:\n    \"\"\"\n    Save the regressed isolated atom energies in a pickle file.\n    \"\"\"\n    save_pkl(self._e0s_dict, self.preprocess_path)\n</code></pre>"},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.dispatch_factory","title":"<code>dispatch_factory(data, **kwargs)</code>","text":"<p>Factory function that select the correct energy class for the fetching/calculation of isolated atom energies.</p> <p>Parameters:</p> Name Type Description Default <code>data</code> 
<p>openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by references</p> required <code>kwargs</code> <p>dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.</p> <code>{}</code> <p>Returns:</p> Type Description <code>IsolatedEnergyInterface</code> <p>Initialized IsolatedEnergyInterface-like object</p> Source code in <code>openqdc/datasets/energies.py</code> <pre><code>def dispatch_factory(data: Any, **kwargs: Dict) -&gt; \"IsolatedEnergyInterface\":\n    \"\"\"\n    Factory function that select the correct\n    energy class for the fetching/calculation\n    of isolated atom energies.\n\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n\n    Returns:\n        Initialized IsolatedEnergyInterface-like object\n    \"\"\"\n    if data.energy_type == \"formation\":\n        return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"regression\":\n        try:\n            return RegressionEnergy(data, **kwargs)\n        except np.linalg.LinAlgError:\n            logger.warning(\"Error! 
Using physical energies instead.\")\n            return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"null\":\n        return NullEnergy(data, **kwargs)\n</code></pre>"},{"location":"API/formats.html","title":"Format loading","text":""},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure","title":"<code>GeneralStructure</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract Factory class for datasets type in the openQDC package.</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>class GeneralStructure(ABC):\n    \"\"\"\n    Abstract Factory class for datasets type in the openQDC package.\n    \"\"\"\n\n    _ext: Optional[str] = None\n    _extra_files: Optional[List[str]] = None\n\n    @property\n    def ext(self):\n        return self._ext\n\n    @property\n    @abstractmethod\n    def load_fn(self) -&gt; Callable:\n        \"\"\"\n        Function to use for loading the data.\n        Must be implemented by the child class.\n\n        Returns:\n            the function to use for loading the data\n        \"\"\"\n        raise NotImplementedError\n\n    def add_extension(self, filename: str) -&gt; str:\n        \"\"\"\n        Add the correct extension to a filename\n\n        Parameters:\n            filename:  the filename to add the extension to\n\n        Returns:\n            the filename with the extension\n        \"\"\"\n        return filename + self.ext\n\n    @abstractmethod\n    def save_preprocess(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_dict: Dict[str, np.ndarray],\n        extra_data_keys: List[str],\n        extra_data_types: Dict[str, type],\n    ) -&gt; List[str]:\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n        Must be implemented by the child class.\n\n        Parameters:\n            preprocess_path:  path to the 
preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_dict:        dictionary of data to save\n            extra_data_keys:  list of keys to load from the extra data file\n            extra_data_types: dictionary of data types for each key\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def load_extra_files(\n        self,\n        data: Dict[str, np.ndarray],\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        pkl_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Load extra files required to define other types of data.\n        Must be implemented by the child class.\n\n        Parameters:\n            data:  dictionary of data to load\n            preprocess_path:  path to the preprocessed data file\n            data_keys:    list of keys to load from the data file\n            pkl_data_keys:   list of keys to load from the extra files\n            overwrite:   whether to overwrite the local cache\n        \"\"\"\n        raise NotImplementedError\n\n    def join_and_ext(self, path: Union[str, PathLike], filename: str) -&gt; Union[str, PathLike]:\n        \"\"\"\n        Join a path and a filename and add the correct extension.\n\n        Parameters:\n            path:  the path to join\n            filename:  the filename to join\n\n        Returns:\n            the joined path with the correct extension\n        \"\"\"\n        return p_join(path, self.add_extension(filename))\n\n    def load_data(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_types: Dict[str, np.dtype],\n        data_shapes: Dict[str, Tuple[int, int]],\n        extra_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Main method to load the data from a filetype structure like memmap or zarr.\n\n        Parameters:\n            preprocess_path:  path 
to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_types:       dictionary of data types for each key\n            data_shapes:      dictionary of shapes for each key\n            extra_data_keys:  list of keys to load from the extra data file\n            overwrite:        whether to overwrite the local cache\n        \"\"\"\n        data = {}\n        for key in data_keys:\n            filename = self.join_and_ext(preprocess_path, key)\n            pull_locally(filename, overwrite=overwrite)\n            data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n            data[key] = self.unpack(data[key])\n            data[key] = data[key].reshape(*data_shapes[key])\n\n        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n        return data\n\n    def unpack(self, data: any) -&gt; any:\n        \"\"\"\n        Unpack the data from the loaded file.\n\n        Parameters:\n            data:  the data to unpack\n\n        Returns:\n            the unpacked data\n        \"\"\"\n        return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_fn","title":"<code>load_fn: Callable</code>  <code>abstractmethod</code> <code>property</code>","text":"<p>Function to use for loading the data. 
Must be implemented by the child class.</p> <p>Returns:</p> Type Description <code>Callable</code> <p>the function to use for loading the data</p>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.add_extension","title":"<code>add_extension(filename)</code>","text":"<p>Add the correct extension to a filename</p> <p>Parameters:</p> Name Type Description Default <code>filename</code> <code>str</code> <p>the filename to add the extension to</p> required <p>Returns:</p> Type Description <code>str</code> <p>the filename with the extension</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def add_extension(self, filename: str) -&gt; str:\n    \"\"\"\n    Add the correct extension to a filename\n\n    Parameters:\n        filename:  the filename to add the extension to\n\n    Returns:\n        the filename with the extension\n    \"\"\"\n    return filename + self.ext\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.join_and_ext","title":"<code>join_and_ext(path, filename)</code>","text":"<p>Join a path and a filename and add the correct extension.</p> <p>Parameters:</p> Name Type Description Default <code>path</code> <code>Union[str, PathLike]</code> <p>the path to join</p> required <code>filename</code> <code>str</code> <p>the filename to join</p> required <p>Returns:</p> Type Description <code>Union[str, PathLike]</code> <p>the joined path with the correct extension</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def join_and_ext(self, path: Union[str, PathLike], filename: str) -&gt; Union[str, PathLike]:\n    \"\"\"\n    Join a path and a filename and add the correct extension.\n\n    Parameters:\n        path:  the path to join\n        filename:  the filename to join\n\n    Returns:\n        the joined path with the correct extension\n    \"\"\"\n    return p_join(path, 
self.add_extension(filename))\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_data","title":"<code>load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)</code>","text":"<p>Main method to load the data from a filetype structure like memmap or zarr.</p> <p>Parameters:</p> Name Type Description Default <code>preprocess_path</code> <code>Union[str, PathLike]</code> <p>path to the preprocessed data file</p> required <code>data_keys</code> <code>List[str]</code> <p>list of keys to load from the data file</p> required <code>data_types</code> <code>Dict[str, dtype]</code> <p>dictionary of data types for each key</p> required <code>data_shapes</code> <code>Dict[str, Tuple[int, int]]</code> <p>dictionary of shapes for each key</p> required <code>extra_data_keys</code> <code>List[str]</code> <p>list of keys to load from the extra data file</p> required <code>overwrite</code> <code>bool</code> <p>whether to overwrite the local cache</p> required Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def load_data(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_types: Dict[str, np.dtype],\n    data_shapes: Dict[str, Tuple[int, int]],\n    extra_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Main method to load the data from a filetype structure like memmap or zarr.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_types:       dictionary of data types for each key\n        data_shapes:      dictionary of shapes for each key\n        extra_data_keys:  list of keys to load from the extra data file\n        overwrite:        whether to overwrite the local cache\n    \"\"\"\n    data = {}\n    for key in data_keys:\n        filename = self.join_and_ext(preprocess_path, key)\n        pull_locally(filename, 
overwrite=overwrite)\n        data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n        data[key] = self.unpack(data[key])\n        data[key] = data[key].reshape(*data_shapes[key])\n\n    data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n    return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_extra_files","title":"<code>load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite)</code>  <code>abstractmethod</code>","text":"<p>Load extra files required to define other types of data. Must be implemented by the child class.</p> <p>Parameters:</p> Name Type Description Default <code>data</code> <code>Dict[str, ndarray]</code> <p>dictionary of data to load</p> required <code>preprocess_path</code> <code>Union[str, PathLike]</code> <p>path to the preprocessed data file</p> required <code>data_keys</code> <code>List[str]</code> <p>list of keys to load from the data file</p> required <code>pkl_data_keys</code> <code>List[str]</code> <p>list of keys to load from the extra files</p> required <code>overwrite</code> <code>bool</code> <p>whether to overwrite the local cache</p> required Source code in <code>openqdc/datasets/structure.py</code> <pre><code>@abstractmethod\ndef load_extra_files(\n    self,\n    data: Dict[str, np.ndarray],\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    pkl_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Load extra files required to define other types of data.\n    Must be implemented by the child class.\n\n    Parameters:\n        data:  dictionary of data to load\n        preprocess_path:  path to the preprocessed data file\n        data_keys:    list of keys to load from the data file\n        pkl_data_keys:   list of keys to load from the extra files\n        overwrite:   whether to overwrite the local cache\n    \"\"\"\n    raise 
NotImplementedError\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.save_preprocess","title":"<code>save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types)</code>  <code>abstractmethod</code>","text":"<p>Save the preprocessed data to the cache directory and optionally upload it to the remote storage. Must be implemented by the child class.</p> <p>Parameters:</p> Name Type Description Default <code>preprocess_path</code> <code>Union[str, PathLike]</code> <p>path to the preprocessed data file</p> required <code>data_keys</code> <code>List[str]</code> <p>list of keys to load from the data file</p> required <code>data_dict</code> <code>Dict[str, ndarray]</code> <p>dictionary of data to save</p> required <code>extra_data_keys</code> <code>List[str]</code> <p>list of keys to load from the extra data file</p> required <code>extra_data_types</code> <code>Dict[str, type]</code> <p>dictionary of data types for each key</p> required Source code in <code>openqdc/datasets/structure.py</code> <pre><code>@abstractmethod\ndef save_preprocess(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_dict: Dict[str, np.ndarray],\n    extra_data_keys: List[str],\n    extra_data_types: Dict[str, type],\n) -&gt; List[str]:\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n    Must be implemented by the child class.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_dict:        dictionary of data to save\n        extra_data_keys:  list of keys to load from the extra data file\n        extra_data_types: dictionary of data types for each key\n    \"\"\"\n    raise 
NotImplementedError\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.unpack","title":"<code>unpack(data)</code>","text":"<p>Unpack the data from the loaded file.</p> <p>Parameters:</p> Name Type Description Default <code>data</code> <code>any</code> <p>the data to unpack</p> required <p>Returns:</p> Type Description <code>any</code> <p>the unpacked data</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>def unpack(self, data: any) -&gt; any:\n    \"\"\"\n    Unpack the data from the loaded file.\n\n    Parameters:\n        data:  the data to unpack\n\n    Returns:\n        the unpacked data\n    \"\"\"\n    return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.MemMapDataset","title":"<code>MemMapDataset</code>","text":"<p>               Bases: <code>GeneralStructure</code></p> <p>Dataset structure for memory-mapped numpy arrays and props.pkl files.</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>class MemMapDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for memory-mapped numpy arrays and props.pkl files.\n    \"\"\"\n\n    _ext = \".mmap\"\n    _extra_files = [\"props.pkl\"]\n\n    @property\n    def load_fn(self):\n        return np.memmap\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -&gt; List[str]:\n        local_paths = []\n        for key in data_keys:\n            local_path = self.join_and_ext(preprocess_path, key)\n            out = np.memmap(local_path, mode=\"w+\", dtype=data_dict[key].dtype, shape=data_dict[key].shape)\n            out[:] = data_dict.pop(key)[:]\n            out.flush()\n            local_paths.append(local_path)\n\n        # save smiles and subset\n        local_path = p_join(preprocess_path, \"props.pkl\")\n\n        # assert that (required) pkl keys are present in data_dict\n        assert all([key in data_dict.keys() for key in 
extra_data_keys])\n\n        # store unique and inverse indices for str-based pkl keys\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        with open(local_path, \"wb\") as f:\n            pkl.dump(data_dict, f)\n\n        local_paths.append(local_path)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = p_join(preprocess_path, \"props.pkl\")\n        pull_locally(filename, overwrite=overwrite)\n        with open(filename, \"rb\") as f:\n            tmp = pkl.load(f)\n            all_pkl_keys = set(tmp.keys()) - set(data_keys)\n            # assert required pkl_keys are present in all_pkl_keys\n            assert all([key in all_pkl_keys for key in pkl_data_keys])\n            for key in all_pkl_keys:\n                x = tmp.pop(key)\n                if len(x) == 2:\n                    data[key] = x[0][x[1]]\n                else:\n                    data[key] = x\n        return data\n</code></pre>"},{"location":"API/formats.html#openqdc.datasets.structure.ZarrDataset","title":"<code>ZarrDataset</code>","text":"<p>               Bases: <code>GeneralStructure</code></p> <p>Dataset structure for zarr files.</p> Source code in <code>openqdc/datasets/structure.py</code> <pre><code>class ZarrDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for zarr files.\n    \"\"\"\n\n    _ext = \".zip\"\n    _extra_files = [\"metadata.zip\"]\n    _zarr_version = 2\n\n    @property\n    def load_fn(self):\n        return zarr.open\n\n    def unpack(self, data):\n        return data[:]\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -&gt; List[str]:\n        # os.makedirs(p_join(ds.root, \"zips\",  ds.__name__), exist_ok=True)\n        local_paths = []\n        for key, value in data_dict.items():\n    
        if key not in data_keys:\n                continue\n            zarr_path = self.join_and_ext(preprocess_path, key)\n            value = data_dict.pop(key)\n            z = zarr.open(\n                zarr.storage.ZipStore(zarr_path),\n                \"w\",\n                zarr_version=self._zarr_version,\n                shape=value.shape,\n                dtype=value.dtype,\n            )\n            z[:] = value[:]\n            local_paths.append(zarr_path)\n            # if key in attrs:\n            #    z.attrs.update(attrs[key])\n\n        metadata = p_join(preprocess_path, \"metadata.zip\")\n\n        group = zarr.group(zarr.storage.ZipStore(metadata))\n\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        for key, value in data_dict.items():\n            # sub=group.create_group(key)\n            if key in [\"name\", \"subset\"]:\n                data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)\n                data[:] = value[0][:]\n                data2 = group.create_dataset(key + \"_ptr\", shape=value[1].shape, dtype=np.int32)\n                data2[:] = value[1][:]\n            else:\n                data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)\n                data[:] = value[:]\n        local_paths.append(metadata)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = self.join_and_ext(preprocess_path, \"metadata\")\n        pull_locally(filename, overwrite=overwrite)\n        tmp = self.load_fn(filename)\n        all_pkl_keys = set(tmp.keys()) - set(data_keys)\n        # assert required pkl_keys are present in all_pkl_keys\n        assert all([key in all_pkl_keys for key in pkl_data_keys])\n        for key in all_pkl_keys:\n            if key not in pkl_data_keys:\n             
   data[key] = tmp[key][:][tmp[key][:]]\n            else:\n                data[key] = tmp[key][:]\n        return data\n</code></pre>"},{"location":"API/methods.html","title":"QM Methods","text":""},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod","title":"<code>InteractionMethod</code>","text":"<p>               Bases: <code>QmMethod</code></p> Source code in <code>openqdc/methods/enums.py</code> <pre><code>class InteractionMethod(QmMethod):\n    CCSD_T_NN = Functional.CCSDT, BasisSet.NN\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    DCCSDT_HA_DZ = Functional.DCCSDT, BasisSet.HA_DZ\n    DCCSDT_HA_TZ = Functional.DCCSDT, BasisSet.HA_TZ\n    DLPNO_CCSDT = Functional.DLPNO_CCSDT, BasisSet.NONE\n    DLPNO_CCSDT0 = (\n        Functional.DLPNO_CCSDT0,\n        BasisSet.NONE,\n    )\n    FN_DMC = Functional.FN_DMC, BasisSet.NONE\n    FIXED = Functional.FIXED, BasisSet.NONE\n    LNO_CCSDT = Functional.LNO_CCSDT, BasisSet.NONE\n    MP2_CBS = Functional.MP2, BasisSet.CBS\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MP2_5_CBS_ADZ = Functional.MP2_5, BasisSet.CBS_ADZ\n    MP2C_CBS = Functional.MP2C, BasisSet.CBS\n    QCISDT_CBS = Functional.QCISDT, BasisSet.CBS\n    SAPT0_AUG_CC_PWCVXZ = Functional.SAPT0, BasisSet.AUG_CC_PWCVXZ\n    SAPT0_JUN_CC_PVDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDZ\n    SAPT0_JUN_CC_PVDDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDDZ\n    SAPT0_AUG_CC_PVDDZ = Functional.SAPT0, BasisSet.AUG_CC_PVDDZ\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get an empty atomization energy dictionary because Interaction methods don't require this\"\"\"\n        return {}\n</code></pre>"},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod.atom_energies_dict","title":"<code>atom_energies_dict</code>  
<code>property</code>","text":"<p>Get an empty atomization energy dictionary because Interaction methods don't require this</p>"},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod","title":"<code>PotentialMethod</code>","text":"<p>               Bases: <code>QmMethod</code></p> Source code in <code>openqdc/methods/enums.py</code> <pre><code>class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1\n    B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP\n    B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ\n    B1LYP_VWN5_TZP = Functional.B1LYP_VWN5, BasisSet.TZP\n    B1PW91_VWN5_DZP = Functional.B1PW91_VWN5, BasisSet.DZP\n    B1PW91_VWN5_SZ = Functional.B1PW91_VWN5, BasisSet.SZ\n    B1PW91_VWN5_TZP = Functional.B1PW91_VWN5, BasisSet.TZP\n    B3LYP_STO3G = Functional.B3LYP, BasisSet.STO3G  # TODO: calculate e0s\n    B3LYP_VWN5_DZP = Functional.B3LYP_VWN5, BasisSet.DZP\n    B3LYP_VWN5_SZ = Functional.B3LYP_VWN5, BasisSet.SZ\n    B3LYP_VWN5_TZP = Functional.B3LYP_VWN5, BasisSet.TZP\n    B3LYP_S_VWN5_DZP = Functional.B3LYP_S_VWN5, BasisSet.DZP\n    B3LYP_S_VWN5_SZ = Functional.B3LYP_S_VWN5, BasisSet.SZ\n    B3LYP_S_VWN5_TZP = Functional.B3LYP_S_VWN5, BasisSet.TZP\n    B3LYP_D_DZP = Functional.B3LYPD, BasisSet.DZP\n    B3LYP_D_SZ = Functional.B3LYPD, BasisSet.SZ\n    B3LYP_D_TZP = Functional.B3LYPD, BasisSet.TZP\n    B3LYP_D3_BJ_DEF2_TZVP = Functional.B3LYP_D3_BJ, BasisSet.DEF2_TZVP\n    B3LYP_6_31G_D = Functional.B3LYP, BasisSet.GSTAR\n    B3LYP_DEF2_TZVP = Functional.B3LYP, BasisSet.DEF2_TZVP\n    B97_1_DZP = Functional.B97_1, BasisSet.DZP\n    B97_1_SZ = Functional.B97_1, BasisSet.SZ\n    B97_1_TZP = Functional.B97_1, BasisSet.TZP\n    B97_2_DZP = Functional.B97_2, BasisSet.DZP\n    B97_2_SZ = Functional.B97_2, BasisSet.SZ\n    B97_2_TZP = Functional.B97_2, BasisSet.TZP\n    B97_D_DZP = Functional.B97_D, BasisSet.DZP\n    B97_D_SZ = Functional.B97_D, BasisSet.SZ\n    B97_D_TZP = Functional.B97_D, BasisSet.TZP\n    B97_DZP = 
Functional.B97, BasisSet.DZP\n    B97_SZ = Functional.B97, BasisSet.SZ\n    B97_TZP = Functional.B97, BasisSet.TZP\n    BECKE00_X_ONLY_DZP = Functional.BECKE00_X_ONLY, BasisSet.DZP\n    BECKE00_X_ONLY_SZ = Functional.BECKE00_X_ONLY, BasisSet.SZ\n    BECKE00_X_ONLY_TZP = Functional.BECKE00_X_ONLY, BasisSet.TZP\n    BECKE00_DZP = Functional.BECKE00, BasisSet.DZP\n    BECKE00_SZ = Functional.BECKE00, BasisSet.SZ\n    BECKE00_TZP = Functional.BECKE00, BasisSet.TZP\n    BECKE00X_XC_DZP = Functional.BECKE00X_XC, BasisSet.DZP\n    BECKE00X_XC_SZ = Functional.BECKE00X_XC, BasisSet.SZ\n    BECKE00X_XC_TZP = Functional.BECKE00X_XC, BasisSet.TZP\n    BECKE88X_BR89C_DZP = Functional.BECKE88X_BR89C, BasisSet.DZP\n    BECKE88X_BR89C_SZ = Functional.BECKE88X_BR89C, BasisSet.SZ\n    BECKE88X_BR89C_TZP = Functional.BECKE88X_BR89C, BasisSet.TZP\n    BHANDH_DZP = Functional.BHANDH, BasisSet.DZP\n    BHANDH_SZ = Functional.BHANDH, BasisSet.SZ\n    BHANDH_TZP = Functional.BHANDH, BasisSet.TZP\n    BHANDHLYP_DZP = Functional.BHANDHLYP, BasisSet.DZP\n    BHANDHLYP_SZ = Functional.BHANDHLYP, BasisSet.SZ\n    BHANDHLYP_TZP = Functional.BHANDHLYP, BasisSet.TZP\n    BLAP3_DZP = Functional.BLAP3, BasisSet.DZP\n    BLAP3_SZ = Functional.BLAP3, BasisSet.SZ\n    BLAP3_TZP = Functional.BLAP3, BasisSet.TZP\n    BLYP_D_DZP = Functional.BLYPD, BasisSet.DZP\n    BLYP_D_SZ = Functional.BLYPD, BasisSet.SZ\n    BLYP_D_TZP = Functional.BLYPD, BasisSet.TZP\n    BLYP_DZP = Functional.BLYP, BasisSet.DZP\n    BLYP_SZ = Functional.BLYP, BasisSet.SZ\n    BLYP_TZP = Functional.BLYP, BasisSet.TZP\n    BMTAU1_DZP = Functional.BMTAU1, BasisSet.DZP\n    BMTAU1_SZ = Functional.BMTAU1, BasisSet.SZ\n    BMTAU1_TZP = Functional.BMTAU1, BasisSet.TZP\n    BOP_DZP = Functional.BOP, BasisSet.DZP\n    BOP_SZ = Functional.BOP, BasisSet.SZ\n    BOP_TZP = Functional.BOP, BasisSet.TZP\n    BP_DZP = Functional.BP, BasisSet.DZP\n    BP_SZ = Functional.BP, BasisSet.SZ\n    BP_TZP = Functional.BP, BasisSet.TZP\n    BP86_D_DZP = 
Functional.BP86_D, BasisSet.DZP\n    BP86_D_SZ = Functional.BP86_D, BasisSet.SZ\n    BP86_D_TZP = Functional.BP86_D, BasisSet.TZP\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ\n\n    DFT3B = Functional.DFT3B, BasisSet.NONE\n    DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP\n    FT97_DZP = Functional.FT97, BasisSet.DZP\n    FT97_SZ = Functional.FT97, BasisSet.SZ\n    FT97_TZP = Functional.FT97, BasisSet.TZP\n    GFN1_XTB = Functional.GFN1_XTB, BasisSet.NONE\n    GFN2_XTB = Functional.GFN2_XTB, BasisSet.NONE\n    HCTH_120_DZP = Functional.HCTH_120, BasisSet.DZP\n    HCTH_120_SZ = Functional.HCTH_120, BasisSet.SZ\n    HCTH_120_TZP = Functional.HCTH_120, BasisSet.TZP\n    HCTH_147_DZP = Functional.HCTH_147, BasisSet.DZP\n    HCTH_147_SZ = Functional.HCTH_147, BasisSet.SZ\n    HCTH_147_TZP = Functional.HCTH_147, BasisSet.TZP\n    HCTH_407_DZP = Functional.HCTH_407, BasisSet.DZP\n    HCTH_407_SZ = Functional.HCTH_407, BasisSet.SZ\n    HCTH_407_TZP = Functional.HCTH_407, BasisSet.TZP\n    HCTH_93_DZP = Functional.HCTH_93, BasisSet.DZP\n    HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ\n    HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP\n    HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP\n    HF_CC_PVDZ = (\n        Functional.HF,\n        BasisSet.CC_PVDZ,\n    )\n    HF_CC_PVQZ = (\n        Functional.HF,\n        BasisSet.CC_PVQZ,\n    )\n    HF_CC_PVTZ = (\n        Functional.HF,\n        BasisSet.CC_PVTZ,\n    )\n    KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP\n    KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ\n    KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP\n    KCIS_ORIGINAL_DZP = Functional.KCIS_ORIGINAL, BasisSet.DZP\n    KCIS_ORIGINAL_SZ = Functional.KCIS_ORIGINAL, BasisSet.SZ\n    KCIS_ORIGINAL_TZP = Functional.KCIS_ORIGINAL, 
BasisSet.TZP\n    KMLYP_VWN5_DZP = Functional.KMLYP_VWN5, BasisSet.DZP\n    KMLYP_VWN5_SZ = Functional.KMLYP_VWN5, BasisSet.SZ\n    KMLYP_VWN5_TZP = Functional.KMLYP_VWN5, BasisSet.TZP\n    KT1_DZP = Functional.KT1, BasisSet.DZP\n    KT1_SZ = Functional.KT1, BasisSet.SZ\n    KT1_TZP = Functional.KT1, BasisSet.TZP\n    KT2_DZP = Functional.KT2, BasisSet.DZP\n    KT2_SZ = Functional.KT2, BasisSet.SZ\n    KT2_TZP = Functional.KT2, BasisSet.TZP\n    LDA_VWN_DZP = Functional.LDA_VWN, BasisSet.DZP\n    LDA_VWN_SZ = Functional.LDA_VWN, BasisSet.SZ\n    LDA_VWN_TZP = Functional.LDA_VWN, BasisSet.TZP\n    M05_2X_DZP = Functional.M05_2X, BasisSet.DZP\n    M05_2X_SZ = Functional.M05_2X, BasisSet.SZ\n    M05_2X_TZP = Functional.M05_2X, BasisSet.TZP\n    M05_DZP = Functional.M05, BasisSet.DZP\n    M05_SZ = Functional.M05, BasisSet.SZ\n    M05_TZP = Functional.M05, BasisSet.TZP\n    M06_2X_DZP = Functional.M06_2X, BasisSet.DZP\n    M06_2X_SZ = Functional.M06_2X, BasisSet.SZ\n    M06_2X_TZP = Functional.M06_2X, BasisSet.TZP\n    M06_L_DZP = Functional.M06_L, BasisSet.DZP\n    M06_L_SZ = Functional.M06_L, BasisSet.SZ\n    M06_L_TZP = Functional.M06_L, BasisSet.TZP\n    M06_DZP = Functional.M06, BasisSet.DZP\n    M06_SZ = Functional.M06, BasisSet.SZ\n    M06_TZP = Functional.M06, BasisSet.TZP\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MPBE_DZP = Functional.MPBE, BasisSet.DZP\n    MPBE_SZ = Functional.MPBE, BasisSet.SZ\n    MPBE_TZP = Functional.MPBE, BasisSet.TZP\n    MPBE0KCIS_DZP = Functional.MPBE0KCIS, BasisSet.DZP\n    MPBE0KCIS_SZ = Functional.MPBE0KCIS, BasisSet.SZ\n    MPBE0KCIS_TZP = Functional.MPBE0KCIS, BasisSet.TZP\n    MPBE1KCIS_DZP = Functional.MPBE1KCIS, BasisSet.DZP\n    MPBE1KCIS_SZ = Functional.MPBE1KCIS, BasisSet.SZ\n    MPBE1KCIS_TZP = Functional.MPBE1KCIS, BasisSet.TZP\n    MPBEKCIS_DZP = Functional.MPBEKCIS, BasisSet.DZP\n    MPBEKCIS_SZ = 
Functional.MPBEKCIS, BasisSet.SZ\n    MPBEKCIS_TZP = Functional.MPBEKCIS, BasisSet.TZP\n    MPW_DZP = Functional.MPW, BasisSet.DZP\n    MPW_SZ = Functional.MPW, BasisSet.SZ\n    MPW_TZP = Functional.MPW, BasisSet.TZP\n    MPW1K_DZP = Functional.MPW1K, BasisSet.DZP\n    MPW1K_SZ = Functional.MPW1K, BasisSet.SZ\n    MPW1K_TZP = Functional.MPW1K, BasisSet.TZP\n    MPW1PW_DZP = Functional.MPW1PW, BasisSet.DZP\n    MPW1PW_SZ = Functional.MPW1PW, BasisSet.SZ\n    MPW1PW_TZP = Functional.MPW1PW, BasisSet.TZP\n    MVS_DZP = Functional.MVS, BasisSet.DZP\n    MVS_SZ = Functional.MVS, BasisSet.SZ\n    MVS_TZP = Functional.MVS, BasisSet.TZP\n    MVSX_DZP = Functional.MVSX, BasisSet.DZP\n    MVSX_SZ = Functional.MVSX, BasisSet.SZ\n    MVSX_TZP = Functional.MVSX, BasisSet.TZP\n    O3LYP_VWN5_DZP = Functional.O3LYP_VWN5, BasisSet.DZP\n    O3LYP_VWN5_SZ = Functional.O3LYP_VWN5, BasisSet.SZ\n    O3LYP_VWN5_TZP = Functional.O3LYP_VWN5, BasisSet.TZP\n    OLAP3_DZP = Functional.OLAP3, BasisSet.DZP\n    OLAP3_SZ = Functional.OLAP3, BasisSet.SZ\n    OLAP3_TZP = Functional.OLAP3, BasisSet.TZP\n    OLYP_DZP = Functional.OLYP, BasisSet.DZP\n    OLYP_SZ = Functional.OLYP, BasisSet.SZ\n    OLYP_TZP = Functional.OLYP, BasisSet.TZP\n    OPBE_DZP = Functional.OPBE, BasisSet.DZP\n    OPBE_SZ = Functional.OPBE, BasisSet.SZ\n    OPBE_TZP = Functional.OPBE, BasisSet.TZP\n    OPBE0_DZP = Functional.OPBE0, BasisSet.DZP\n    OPBE0_SZ = Functional.OPBE0, BasisSet.SZ\n    OPBE0_TZP = Functional.OPBE0, BasisSet.TZP\n    OPERDEW_DZP = Functional.OPERDEW, BasisSet.DZP\n    OPERDEW_SZ = Functional.OPERDEW, BasisSet.SZ\n    OPERDEW_TZP = Functional.OPERDEW, BasisSet.TZP\n    PBE_D_DZP = Functional.PBE_D, BasisSet.DZP\n    PBE_D_SZ = Functional.PBE_D, BasisSet.SZ\n    PBE_D_TZP = Functional.PBE_D, BasisSet.TZP\n    PBE_D3_BJ_DEF2_TZVP = Functional.PBE_D3_BJ, BasisSet.DEF2_TZVP\n    PBE_DEF2_TZVP = Functional.PBE, BasisSet.DEF2_TZVP\n    PBE_DZP = Functional.PBE, BasisSet.DZP\n    PBE_SZ = Functional.PBE, 
BasisSet.SZ\n    PBE_TZP = Functional.PBE, BasisSet.TZP\n    PBE0_DZP = Functional.PBE0, BasisSet.DZP\n    PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP\n    PBE0_SZ = Functional.PBE0, BasisSet.SZ\n    PBE0_TZP = Functional.PBE0, BasisSet.TZP\n    PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD\n    PBESOL_DZP = Functional.PBESOL, BasisSet.DZP\n    PBESOL_SZ = Functional.PBESOL, BasisSet.SZ\n    PBESOL_TZP = Functional.PBESOL, BasisSet.TZP\n    PKZB_DZP = Functional.PKZB, BasisSet.DZP\n    PKZB_SZ = Functional.PKZB, BasisSet.SZ\n    PKZB_TZP = Functional.PKZB, BasisSet.TZP\n    PKZBX_KCISCOR_DZP = Functional.PKZBX_KCISCOR, BasisSet.DZP\n    PKZBX_KCISCOR_SZ = Functional.PKZBX_KCISCOR, BasisSet.SZ\n    PKZBX_KCISCOR_TZP = Functional.PKZBX_KCISCOR, BasisSet.TZP\n    PM6 = Functional.PM6, BasisSet.NONE\n    PW91_DZP = Functional.PW91, BasisSet.DZP\n    PW91_SZ = Functional.PW91, BasisSet.SZ\n    PW91_TZP = Functional.PW91, BasisSet.TZP\n    REVPBE_D3_BJ_DEF2_TZVP = Functional.REVPBE_D3_BJ, BasisSet.DEF2_TZVP\n    REVPBE_DZP = Functional.REVPBE, BasisSet.DZP\n    REVPBE_SZ = Functional.REVPBE, BasisSet.SZ\n    REVPBE_TZP = Functional.REVPBE, BasisSet.TZP\n    REVTPSS_DZP = Functional.REVTPSS, BasisSet.DZP\n    REVTPSS_SZ = Functional.REVTPSS, BasisSet.SZ\n    REVTPSS_TZP = Functional.REVTPSS, BasisSet.TZP\n    RGE2_DZP = Functional.RGE2, BasisSet.DZP\n    RGE2_SZ = Functional.RGE2, BasisSet.SZ\n    RGE2_TZP = Functional.RGE2, BasisSet.TZP\n    RPBE_DZP = Functional.RPBE, BasisSet.DZP\n    RPBE_SZ = Functional.RPBE, BasisSet.SZ\n    RPBE_TZP = Functional.RPBE, BasisSet.TZP\n    SSB_D_DZP = Functional.SSB_D, BasisSet.DZP\n    SSB_D_SZ = Functional.SSB_D, BasisSet.SZ\n    SSB_D_TZP = Functional.SSB_D, BasisSet.TZP\n    SVWN_DEF2_TZVP = Functional.SVWN, BasisSet.DEF2_TZVP\n    TMGGA_DZP = Functional.TMGGA, BasisSet.DZP\n    TMGGA_SZ = Functional.TMGGA, BasisSet.SZ\n    TMGGA_TZP = Functional.TMGGA, BasisSet.TZP\n    TAU_HCTH_HYBRID_DZP = 
Functional.TAU_HCTH_HYBRID, BasisSet.DZP\n    TAU_HCTH_HYBRID_SZ = Functional.TAU_HCTH_HYBRID, BasisSet.SZ\n    TAU_HCTH_HYBRID_TZP = Functional.TAU_HCTH_HYBRID, BasisSet.TZP\n    TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP\n    TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ\n    TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP\n    TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ\n    TPSSD_DZP = Functional.TPSSD, BasisSet.DZP\n    TPSSD_SZ = Functional.TPSSD, BasisSet.SZ\n    TPSSD_TZP = Functional.TPSSD, BasisSet.TZP\n    TPSS_DZP = Functional.TPSS, BasisSet.DZP\n    TPSS_SZ = Functional.TPSS, BasisSet.SZ\n    TPSS_TZP = Functional.TPSS, BasisSet.TZP\n    TPSSH_DEF2_TZVP = Functional.TPSSH, BasisSet.DEF2_TZVP\n    TPSSH_DZP = Functional.TPSSH, BasisSet.DZP\n    TPSSH_SZ = Functional.TPSSH, BasisSet.SZ\n    TPSSH_TZP = Functional.TPSSH, BasisSet.TZP\n    TTM2_1_F = Functional.TTM2_1_F, BasisSet.NONE\n    VS98_X_XC_DZP = Functional.VS98_X_XC, BasisSet.DZP\n    VS98_X_XC_SZ = Functional.VS98_X_XC, BasisSet.SZ\n    VS98_X_XC_TZP = Functional.VS98_X_XC, BasisSet.TZP\n    VS98_X_ONLY_DZP = Functional.VS98_X_ONLY, BasisSet.DZP\n    VS98_X_ONLY_SZ = Functional.VS98_X_ONLY, BasisSet.SZ\n    VS98_X_ONLY_TZP = Functional.VS98_X_ONLY, BasisSet.TZP\n    VS98_DZP = Functional.VS98, BasisSet.DZP\n    VS98_SZ = Functional.VS98, BasisSet.SZ\n    VS98_TZP = Functional.VS98, BasisSet.TZP\n    WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD\n    WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP\n    WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP\n    WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ\n    WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR\n    WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ\n    X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, BasisSet.DZP\n    X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ\n    X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP\n    XLYP_DZP = Functional.XLYP, BasisSet.DZP\n 
   XLYP_SZ = Functional.XLYP, BasisSet.SZ\n    XLYP_TZP = Functional.XLYP, BasisSet.TZP\n    NONE = Functional.NONE, BasisSet.NONE\n\n    def _build_default_dict(self):\n        e0_dict = {}\n        for SYMBOL in ATOM_SYMBOLS:\n            for CHARGE in range(-10, 11):\n                e0_dict[(SYMBOL, CHARGE)] = array([0], dtype=float32)\n        return e0_dict\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        key = str(self)\n        try:\n            # print(key)\n            energies = atom_energy_collection.get(key, {})\n            if len(energies) == 0:\n                raise\n        except:  # noqa\n            logger.info(f\"No available atomization energy for the QM method {key}. All values are set to 0.\")\n            energies = self._build_default_dict()\n        return energies\n</code></pre>"},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod.atom_energies_dict","title":"<code>atom_energies_dict</code>  <code>property</code>","text":"<p>Get the atomization energy dictionary</p>"},{"location":"API/methods.html#openqdc.methods.enums.QmMethod","title":"<code>QmMethod</code>","text":"<p>               Bases: <code>Enum</code></p> Source code in <code>openqdc/methods/enums.py</code> <pre><code>class QmMethod(Enum):\n    def __init__(self, functional: Functional, basis_set: BasisSet, cost: float = 0):\n        self.functional = functional\n        self.basis_set = basis_set\n        self.cost = cost\n\n    def __str__(self):\n        if self.basis_set != \"\":\n            s = \"/\".join([str(self.functional), str(self.basis_set)])\n        else:\n            s = str(self.functional)\n        return s\n\n    @property\n    def atom_energies_matrix(self):\n        \"\"\"Get the atomization energy matrix\"\"\"\n        energies = self.atom_energies_dict\n        mat = to_e_matrix(energies)\n\n        return mat\n\n    @property\n    def atom_energies_dict(self):\n       
 \"\"\"Get the atomization energy dictionary\"\"\"\n        raise NotImplementedError()\n</code></pre>"},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_dict","title":"<code>atom_energies_dict</code>  <code>property</code>","text":"<p>Get the atomization energy dictionary</p>"},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_matrix","title":"<code>atom_energies_matrix</code>  <code>property</code>","text":"<p>Get the atomization energy matrix</p>"},{"location":"API/methods.html#isolated-atom-energies","title":"Isolated Atom Energies","text":""},{"location":"API/methods.html#openqdc.methods.atom_energies.to_e_matrix","title":"<code>to_e_matrix(atom_energies)</code>","text":"<p>Get the matrix of isolated atom energies for a dict of non-null values calculates</p> <p>Parameters:</p> Name Type Description Default <code>atom_energies</code> <code>Dict</code> <p>Dict of energies computed for a given QM method. Keys are pairs of (atom, charge) and values are energy values</p> required <p>np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)</p> Type Description <code>ndarray</code> <p>Matrix containing the isolated atom energies for each atom and charge written in the form:</p> <pre><code>        |   | -2 | -1 | 0 | +1 | +2 | &lt;- charges\n        |---|----|----|---|----|----|\n        | 0 |    |    |   |    |    |\n        | 1 |    |    |   |    |    |\n        | 2 |    |    |   |    |    |\n</code></pre> Source code in <code>openqdc/methods/atom_energies.py</code> <pre><code>def to_e_matrix(atom_energies: Dict) -&gt; np.ndarray:\n    \"\"\"\n    Get the matrix of isolated atom energies for a dict of non-null values calculates\n\n    Parameters:\n        atom_energies: Dict of energies computed for a given QM method.\n            Keys are pairs of (atom, charge) and values are energy values\n\n    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)\n        Matrix containing the isolated atom 
energies for each atom and charge written in the form:\n\n                        |   | -2 | -1 | 0 | +1 | +2 | &lt;- charges\n                        |---|----|----|---|----|----|\n                        | 0 |    |    |   |    |    |\n                        | 1 |    |    |   |    |    |\n                        | 2 |    |    |   |    |    |\n    \"\"\"\n\n    matrix = np.zeros((MAX_ATOMIC_NUMBER, MAX_CHARGE_NUMBER))\n    if len(atom_energies) &gt; 0:\n        for key in atom_energies.keys():\n            try:\n                matrix[ATOMIC_NUMBERS[key[0]], key[1] + MAX_CHARGE] = atom_energies[key]\n            except KeyError:\n                logger.error(f\"Isolated atom energies not found for {key}\")\n    return matrix\n</code></pre>"},{"location":"API/properties.html","title":"Defined properties for datasets","text":""},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn","title":"<code>DatasetPropertyMixIn</code>","text":"<p>Mixin class for BaseDataset class to add properties that are common to all datasets.</p> Source code in <code>openqdc/datasets/properties.py</code> <pre><code>class DatasetPropertyMixIn:\n    \"\"\"\n    Mixin class for BaseDataset class to add\n    properties that are common to all datasets.\n    \"\"\"\n\n    @property\n    def atoms_per_molecules(self):\n        try:\n            if hasattr(self, \"_n_atoms\"):\n                return self._n_atoms\n            self._n_atoms = self.data[\"n_atoms\"]\n            return self._n_atoms\n        except:  # noqa\n            return None\n\n    @property\n    def _stats(self):\n        return self.__stats__\n\n    def _compute_average_nb_atoms(self):\n        self.__average_nb_atoms__ = np.mean(self.data[\"n_atoms\"])\n\n    @property\n    def average_n_atoms(self) -&gt; int:\n        \"\"\"\n        Average number of atoms in a molecule in the dataset.\n\n        Returns:\n            Average number of atoms in a molecule in the dataset.\n        \"\"\"\n    
    if self.__average_nb_atoms__ is None:\n            raise StatisticsNotAvailableError(self.__name__)\n        return self.__average_nb_atoms__\n\n    @property\n    def numbers(self) -&gt; np.ndarray:\n        \"\"\"\n        Unique atomic numbers in the dataset\n\n        Returns:\n            Array of the unique atomic numbers in the dataset\n        \"\"\"\n        if hasattr(self, \"_numbers\"):\n            return self._numbers\n        self._numbers = pd.unique(self.data[\"atomic_inputs\"][..., 0]).astype(np.int32)\n        return self._numbers\n\n    @property\n    def charges(self) -&gt; np.ndarray:\n        \"\"\"\n        Unique charges in the dataset\n\n        Returns:\n            Array of the unique charges in the dataset\n        \"\"\"\n        if hasattr(self, \"_charges\"):\n            return self._charges\n        self._charges = np.unique(self.data[\"atomic_inputs\"][..., :2], axis=0).astype(np.int32)\n        return self._charges\n\n    @property\n    def min_max_charges(self) -&gt; Tuple[int, int]:\n        \"\"\"\n        Minimum and maximum charges in the dataset\n\n        Returns:\n            (min_charge, max_charge)\n        \"\"\"\n        if hasattr(self, \"_min_max_charges\"):\n            return self._min_max_charges\n        self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])\n        return self._min_max_charges\n\n    @property\n    def chemical_species(self) -&gt; np.ndarray:\n        \"\"\"\n        Chemical symbols in the dataset\n\n        Returns:\n            Array of the chemical symbols in the dataset\n        \"\"\"\n        return np.array(ATOM_SYMBOLS)[self.numbers]\n</code></pre>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.average_n_atoms","title":"<code>average_n_atoms: int</code>  <code>property</code>","text":"<p>Average number of atoms in a molecule in the dataset.</p> <p>Returns:</p> Type Description <code>int</code> <p>Average number of atoms 
in a molecule in the dataset.</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.charges","title":"<code>charges: np.ndarray</code>  <code>property</code>","text":"<p>Unique charges in the dataset</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Array of the unique charges in the dataset</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.chemical_species","title":"<code>chemical_species: np.ndarray</code>  <code>property</code>","text":"<p>Chemical symbols in the dataset</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Array of the chemical symbols in the dataset</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.min_max_charges","title":"<code>min_max_charges: Tuple[int, int]</code>  <code>property</code>","text":"<p>Minimum and maximum charges in the dataset</p> <p>Returns:</p> Type Description <code>Tuple[int, int]</code> <p>(min_charge, max_charge)</p>"},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.numbers","title":"<code>numbers: np.ndarray</code>  <code>property</code>","text":"<p>Unique atomic numbers in the dataset</p> <p>Returns:</p> Type Description <code>ndarray</code> <p>Array of the unique atomic numbers in the dataset</p>"},{"location":"API/regressor.html","title":"Normalization regressor","text":"<p>Linear Atom Energies regression utilities.</p>"},{"location":"API/regressor.html#openqdc.utils.regressor.LinearSolver","title":"<code>LinearSolver</code>","text":"<p>               Bases: <code>Solver</code></p> <p>Linear regression solver.</p> Note <p>No Uncertainty associated as it is quite small.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class LinearSolver(Solver):\n    \"\"\"\n    Linear regression solver.\n\n    Note:\n        No Uncertainty associated as it is quite small.\n    \"\"\"\n\n    _regr_str = \"linear\"\n\n    @staticmethod\n    def solve(X, 
y):\n        X, y, y_mean = atom_standardization(X, y)\n        E0s = np.linalg.lstsq(X, y, rcond=None)[0]\n        return E0s, None\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor","title":"<code>Regressor</code>","text":"<p>Regressor class for preparing and solving the regression problem for isolated atom energies. An isolated atom energy regression problem is defined as:</p> <p>X = [n_samples, n_species] (number of atoms of each species per sample)</p> <p>Y = [n_samples, ] (energies)</p> <p>The regression problem is solved by solving the linear system X E0 = Y.</p> Example <p>For a system of 2 samples (H2O, CH4)</p> <pre><code>n_species = 3, n_samples = 2\n\nH2O = 2H, 1O -&gt; X = [2, 1, 0]\n\nCH4 = 4H, 1C -&gt; X = [4, 0, 1]\n\nX = [[2, 1, 0],\n    [ 4, 0, 1]]\n\nY = [[10, 20]]\n\nX E0 = Y\n</code></pre> <p>Linear system to solve</p> <pre><code>[[2 eH, 1 eO, 0 eC],\n[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]\n</code></pre> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class Regressor:\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n    A isolated atom energy regression problem is defined as:\\n\n    X = [n_samples, n_species] (number of atoms of each species per sample)\\n\n    Y = [n_samples, ] (energies)\\n\n    The regression problem is solved by solving the linear system X E0 = Y.\n\n    Example:\n        For a sytem of 2 samples (H20, CH4)\\n\n            n_species = 3, n_samples = 2\\n\n            H20 = 2H , 1O -&gt; X = [2, 1, 0]\\n\n            CH4 = 4C, 1H -&gt; X = [1, 0, 4]\\n\n            X = [[2, 1, 0],\n                [ 1, 0, 4]]\\n\n            Y = [[10, 20]]\\n\n            X E0 = Y\\n\n        Linear system to solve\\n\n            [[2 eH, 1 eO, 0 eC],\n            [ 1 eH, 0 eO, 4 eC]] = [[10, 20]]\n    \"\"\"\n\n    solver: Solver\n\n    def __init__(\n        self,\n        energies: np.ndarray,\n        atomic_numbers: np.ndarray,\n        
position_idx_range: np.ndarray,\n        solver_type: str = \"linear\",\n        stride: int = 1,\n        subsample: Optional[Union[float, int]] = None,\n        remove_nan: bool = True,\n        *args: any,\n        **kwargs: any,\n    ):\n        \"\"\"\n        Regressor class for preparing and solving regression problem for isolated atom energies.\n\n        Parameters:\n            energies:\n                numpy array of energies in the shape (n_samples, n_energy_methods)\n            atomic_numbers:\n                numpy array of atomic numbers in the shape (n_atoms,)\n            position_idx_range:\n                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n            solver_type: Type of solver to use. [\"linear\", \"ridge\"]\n            stride: Stride to use for the regression.\n            subsample: Sumsample the dataset.\n                If a float, it is interpreted as a fraction of the dataset to use.\n                If &gt;1 it is interpreted as the number of samples to use.\n            remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n        \"\"\"\n        self.subsample = subsample\n        self.stride = stride\n        self.solver_type = solver_type.lower()\n        self.energies = energies\n        self.atomic_numbers = atomic_numbers\n        self.numbers = pd.unique(atomic_numbers)\n        self.position_idx_range = position_idx_range\n        self.remove_nan = remove_nan\n        self.hparams = {\n            \"subsample\": subsample,\n            \"stride\": stride,\n            \"solver_type\": solver_type,\n        }\n        self._post_init()\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -&gt; \"Regressor\":\n        \"\"\"\n        Initialize 
the regressor object from an openqdc dataset. This is the default method.\n        *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n        Parameters:\n            dataset: openqdc dataset object.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n\n        Returns:\n            Instance of the regressor class.\n        \"\"\"\n        energies = dataset.data[\"energies\"]\n        position_idx_range = dataset.data[\"position_idx_range\"]\n        atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n        return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n\n    def _post_init(self):\n        if self.subsample is not None:\n            self._downsample()\n        self._prepare_inputs()\n        self.solver = self._get_solver()\n\n    def update_hparams(self, hparams):\n        self.hparams.update(hparams)\n\n    def _downsample(self):\n        if self.subsample &lt; 1:\n            idxs = np.arange(self.energies.shape[0])\n            np.random.shuffle(idxs)\n            idxs = idxs[: int(self.energies.shape[0] * self.subsample)]\n            self.energies = self.energies[:: int(1 / self.subsample)]\n            self.position_idx_range = self.position_idx_range[:: int(1 / self.subsample)]\n        else:\n            idxs = np.random.randint(0, self.energies.shape[0], int(self.subsample))\n            self.energies = self.energies[idxs]\n            self.position_idx_range = self.position_idx_range[idxs]\n        self.update_hparams({\"idxs\": idxs})\n\n    def _get_solver(self):\n        try:\n            return AVAILABLE_SOLVERS[self.solver_type]()\n        except KeyError:\n            logger.warning(f\"Unknown solver type {self.solver_type}, defaulting to linear regression.\")\n            return LinearSolver()\n\n    def _prepare_inputs(self) -&gt; 
Tuple[np.ndarray, np.ndarray]:\n        logger.info(\"Preparing inputs for regression.\")\n        len_train = self.energies.shape[0]\n        len_zs = len(self.numbers)\n        A = np.zeros((len_train, len_zs))[:: self.stride]\n        B = self.energies[:: self.stride]\n        for i, ij in enumerate(self.position_idx_range[:: self.stride]):\n            tmp = self.atomic_numbers[ij[0] : ij[1]]\n            for j, z in enumerate(self.numbers):\n                A[i, j] = np.count_nonzero(tmp == z)\n        self.X = A\n        self.y = B\n\n    def solve(self):\n        \"\"\"\n        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n        \"\"\"\n        logger.info(f\"Solving regression with {self.solver}.\")\n        E0_list, cov_list = [], []\n        for energy_idx in range(self.y.shape[1]):\n            if self.remove_nan:\n                idxs = non_nan_idxs(self.y[:, energy_idx])\n                X, y = self.X[idxs], self.y[idxs, energy_idx]\n            else:\n                X, y = self.X, self.y[:, energy_idx]\n            E0s, cov = self.solver(X, y)\n            if cov is None:\n                cov = np.zeros_like(E0s) + 1.0\n            E0_list.append(E0s)\n            cov_list.append(cov)\n        return np.vstack(E0_list).T, np.vstack(cov_list).T\n\n    def __call__(self):\n        return self.solve()\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.__init__","title":"<code>__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)</code>","text":"<p>Regressor class for preparing and solving regression problem for isolated atom energies.</p> <p>Parameters:</p> Name Type Description Default <code>energies</code> <code>ndarray</code> <p>numpy array of energies in the shape (n_samples, n_energy_methods)</p> required <code>atomic_numbers</code> <code>ndarray</code> <p>numpy array of 
atomic numbers in the shape (n_atoms,)</p> required <code>position_idx_range</code> <code>ndarray</code> <p>array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset</p> required <code>solver_type</code> <code>str</code> <p>Type of solver to use. [\"linear\", \"ridge\"]</p> <code>'linear'</code> <code>stride</code> <code>int</code> <p>Stride to use for the regression.</p> <code>1</code> <code>subsample</code> <code>Optional[Union[float, int]]</code> <p>Subsample the dataset. If a float &lt; 1, it is interpreted as a fraction of the dataset to use. Otherwise it is interpreted as the number of samples to use.</p> <code>None</code> <code>remove_nan</code> <code>bool</code> <p>Sanitize the dataset by removing energy samples with NaN values.</p> <code>True</code> <code>*args</code> <code>any</code> <p>Additional arguments to be passed to the regressor.</p> <code>()</code> <code>**kwargs</code> <code>any</code> <p>Additional keyword arguments to be passed to the regressor.</p> <code>{}</code> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def __init__(\n    self,\n    energies: np.ndarray,\n    atomic_numbers: np.ndarray,\n    position_idx_range: np.ndarray,\n    solver_type: str = \"linear\",\n    stride: int = 1,\n    subsample: Optional[Union[float, int]] = None,\n    remove_nan: bool = True,\n    *args: any,\n    **kwargs: any,\n):\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n\n    Parameters:\n        energies:\n            numpy array of energies in the shape (n_samples, n_energy_methods)\n        atomic_numbers:\n            numpy array of atomic numbers in the shape (n_atoms,)\n        position_idx_range:\n            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n        solver_type: Type of solver to use. 
[\"linear\", \"ridge\"]\n        stride: Stride to use for the regression.\n        subsample: Sumsample the dataset.\n            If a float, it is interpreted as a fraction of the dataset to use.\n            If &gt;1 it is interpreted as the number of samples to use.\n        remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n    \"\"\"\n    self.subsample = subsample\n    self.stride = stride\n    self.solver_type = solver_type.lower()\n    self.energies = energies\n    self.atomic_numbers = atomic_numbers\n    self.numbers = pd.unique(atomic_numbers)\n    self.position_idx_range = position_idx_range\n    self.remove_nan = remove_nan\n    self.hparams = {\n        \"subsample\": subsample,\n        \"stride\": stride,\n        \"solver_type\": solver_type,\n    }\n    self._post_init()\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.from_openqdc_dataset","title":"<code>from_openqdc_dataset(dataset, *args, **kwargs)</code>  <code>classmethod</code>","text":"<p>Initialize the regressor object from an openqdc dataset. This is the default method. 
args and and *kwargs are passed to the init method and depends on the specific regressor.</p> <p>Parameters:</p> Name Type Description Default <code>dataset</code> <code>any</code> <p>openqdc dataset object.</p> required <code>*args</code> <code>any</code> <p>Additional arguments to be passed to the regressor.</p> <code>()</code> <code>**kwargs</code> <code>any</code> <p>Additional keyword arguments to be passed to the regressor.</p> <code>{}</code> <p>Returns:</p> Type Description <code>Regressor</code> <p>Instance of the regressor class.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>@classmethod\ndef from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -&gt; \"Regressor\":\n    \"\"\"\n    Initialize the regressor object from an openqdc dataset. This is the default method.\n    *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n    Parameters:\n        dataset: openqdc dataset object.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n\n    Returns:\n        Instance of the regressor class.\n    \"\"\"\n    energies = dataset.data[\"energies\"]\n    position_idx_range = dataset.data[\"position_idx_range\"]\n    atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n    return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.solve","title":"<code>solve()</code>","text":"<p>Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def solve(self):\n    \"\"\"\n    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n    \"\"\"\n    logger.info(f\"Solving regression with {self.solver}.\")\n    E0_list, 
cov_list = [], []\n    for energy_idx in range(self.y.shape[1]):\n        if self.remove_nan:\n            idxs = non_nan_idxs(self.y[:, energy_idx])\n            X, y = self.X[idxs], self.y[idxs, energy_idx]\n        else:\n            X, y = self.X, self.y[:, energy_idx]\n        E0s, cov = self.solver(X, y)\n        if cov is None:\n            cov = np.zeros_like(E0s) + 1.0\n        E0_list.append(E0s)\n        cov_list.append(cov)\n    return np.vstack(E0_list).T, np.vstack(cov_list).T\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.RidgeSolver","title":"<code>RidgeSolver</code>","text":"<p>               Bases: <code>Solver</code></p> <p>Ridge regression solver.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class RidgeSolver(Solver):\n    \"\"\"\n    Ridge regression solver.\n    \"\"\"\n\n    _regr_str = \"ridge\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        A = X.T @ X\n        dy = y - (np.sum(X, axis=1, keepdims=True) * y_mean).reshape(y.shape)\n        Xy = X.T @ dy\n        mean = np.linalg.solve(A, Xy)\n        sigma2 = np.var(X @ mean - dy)\n        Ainv = np.linalg.inv(A)\n        cov = np.sqrt(sigma2 * np.einsum(\"ij,kj,kl,li-&gt;i\", Ainv, X, X, Ainv))\n        mean = mean + y_mean.reshape([-1])\n        return mean, cov\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Solver","title":"<code>Solver</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract class for regression solvers.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>class Solver(ABC):\n    \"\"\"Abstract class for regression solvers.\"\"\"\n\n    _regr_str: str\n\n    @staticmethod\n    @abstractmethod\n    def solve(X: np.ndarray, Y: np.ndarray) -&gt; Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Main method to solve the regression problem.\n        Must be implemented in all the subclasses.\n\n  
      Parameters:\n            X: Input features of shape (n_samples, n_species)\n            Y: Target values of shape (n_samples,) (energy values for the regression)\n\n        Returns:\n            Tuple of predicted values and the estimated uncertainty.\n        \"\"\"\n        pass\n\n    def __call__(self, X, Y):\n        return self.solve(X, Y)\n\n    def __str__(self):\n        return self._regr_str\n\n    def __repr__(self):\n        return str(self)\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.Solver.solve","title":"<code>solve(X, Y)</code>  <code>abstractmethod</code> <code>staticmethod</code>","text":"<p>Main method to solve the regression problem. Must be implemented in all the subclasses.</p> <p>Parameters:</p> Name Type Description Default <code>X</code> <code>ndarray</code> <p>Input features of shape (n_samples, n_species)</p> required <code>Y</code> <code>ndarray</code> <p>Target values of shape (n_samples,) (energy values for the regression)</p> required <p>Returns:</p> Type Description <code>Tuple[ndarray, Optional[ndarray]]</code> <p>Tuple of predicted values and the estimated uncertainty.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>@staticmethod\n@abstractmethod\ndef solve(X: np.ndarray, Y: np.ndarray) -&gt; Tuple[np.ndarray, Optional[np.ndarray]]:\n    \"\"\"\n    Main method to solve the regression problem.\n    Must be implemented in all the subclasses.\n\n    Parameters:\n        X: Input features of shape (n_samples, n_species)\n        Y: Target values of shape (n_samples,) (energy values for the regression)\n\n    Returns:\n        Tuple of predicted values and the estimated uncertainty.\n    \"\"\"\n    pass\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.atom_standardization","title":"<code>atom_standardization(X, y)</code>","text":"<p>Standardize the energies and the atom counts. 
This will make the calculated uncertainty more meaningful.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def atom_standardization(X, y):\n    \"\"\"\n    Standardize the energies and the atom counts.\n    This will make the calculated uncertainty more\n    meaningful.\n    \"\"\"\n    X_norm = X.sum()\n    X = X / X_norm\n    y = y / X_norm\n    y_mean = y.sum() / X.sum()\n    return X, y, y_mean\n</code></pre>"},{"location":"API/regressor.html#openqdc.utils.regressor.non_nan_idxs","title":"<code>non_nan_idxs(array)</code>","text":"<p>Return non nan indices of an array.</p> Source code in <code>openqdc/utils/regressor.py</code> <pre><code>def non_nan_idxs(array):\n    \"\"\"\n    Return non nan indices of an array.\n    \"\"\"\n    return np.where(~np.isnan(array))[0]\n</code></pre>"},{"location":"API/statistics.html","title":"Statistics","text":""},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator","title":"<code>AbstractStatsCalculator</code>","text":"<p>               Bases: <code>ABC</code></p> <p>Abstract class that defines the interface for all the calculators object and the methods to compute the statistics.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class AbstractStatsCalculator(ABC):\n    \"\"\"\n    Abstract class that defines the interface for all\n    the calculators object and the methods to\n    compute the statistics.\n    \"\"\"\n\n    # State Dependencies of the calculator to skip part of the calculation\n    state_dependency = []\n    name = None\n\n    def __init__(\n        self,\n        name: str,\n        energy_type: Optional[str] = None,\n        force_recompute: bool = False,\n        energies: Optional[np.ndarray] = None,\n        n_atoms: Optional[np.ndarray] = None,\n        atom_species: Optional[np.ndarray] = None,\n        position_idx_range: Optional[np.ndarray] = None,\n        e0_matrix: Optional[np.ndarray] = None,\n        atom_charges: 
Optional[np.ndarray] = None,\n        forces: Optional[np.ndarray] = None,\n    ):\n        \"\"\"\n        Parameters:\n            name :\n                Name of the dataset for saving and loading.\n            energy_type :\n                Type of the energy for the computation of the statistics. Used for loading and saving.\n            force_recompute :\n                Flag to force the recomputation of the statistics\n            energies : n\n                Energies of the dataset\n            n_atoms :\n                Number of atoms in the dataset\n            atom_species :\n                Atomic species of the dataset\n            position_idx_range : n\n                Position index range of the dataset\n            e0_matrix :\n                Isolated atom energies matrix of the dataset\n            atom_charges :\n                Atomic charges of the dataset\n            forces :\n                Forces of the dataset\n        \"\"\"\n        self.name = name\n        self.energy_type = energy_type\n        self.force_recompute = force_recompute\n        self.energies = energies\n        self.forces = forces\n        self.position_idx_range = position_idx_range\n        self.e0_matrix = e0_matrix\n        self.n_atoms = n_atoms\n        self.atom_species_charges_tuple = (atom_species, atom_charges)\n        self._root = p_join(get_local_cache(), self.name)\n        if atom_species is not None and atom_charges is not None:\n            # by value not reference\n            self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n\n    @property\n    def has_forces(self) -&gt; bool:\n        return self.forces is not None\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"statistics\", self.name + f\"_{str(self)}\" + \".pkl\")\n        return path\n\n    @property\n    def root(self):\n        \"\"\"\n        Path to the dataset folder\n        \"\"\"\n        
return self._root\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset, recompute: bool = False):\n        \"\"\"\n        Create a calculator object from a dataset object.\n        \"\"\"\n        obj = cls(\n            name=dataset.__name__,\n            force_recompute=recompute,\n            energy_type=dataset.energy_type,\n            energies=dataset.data[\"energies\"],\n            forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n            n_atoms=dataset.data[\"n_atoms\"],\n            position_idx_range=dataset.data[\"position_idx_range\"],\n            atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n            atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n            e0_matrix=dataset.__isolated_atom_energies__,\n        )\n        obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n        return obj\n\n    @abstractmethod\n    def compute(self) -&gt; StatisticsResults:\n        \"\"\"\n        Abstract method to compute the statistics.\n        Must return a StatisticsResults object and be implemented\n        in all the childs\n        \"\"\"\n        raise NotImplementedError\n\n    def save_statistics(self) -&gt; None:\n        \"\"\"\n        Save statistics file to the dataset folder as a pkl file\n        \"\"\"\n        save_pkl(self.result, self.preprocess_path)\n\n    def attempt_load(self) -&gt; bool:\n        \"\"\"\n        Load precomputed statistics file and return the success of the operation\n        \"\"\"\n        try:\n            self.result = load_pkl(self.preprocess_path)\n            logger.info(f\"Statistics for {str(self)} loaded successfully\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n            return False\n\n    def _setup_deps(self, state: Dict) -&gt; None:\n        \"\"\"\n        Check if the dependencies of calculators are satisfied\n        from the state object and set the attributes of the calculator\n        to skip part of the calculation\n        \"\"\"\n        self.state = state\n        self.deps_satisfied = all([dep in state for dep in self.state_dependency])\n        if self.deps_satisfied:\n            for dep in self.state_dependency:\n                setattr(self, dep, state[dep])\n\n    def write_state(self, update: Dict) -&gt; None:\n        \"\"\"\n        Write/update the state dictionary with the update dictionary\n\n        update:\n            dictionary containing the update to the state\n        \"\"\"\n        self.state.update(update)\n\n    def run(self, state: Dict) -&gt; None:\n        \"\"\"\n        Main method to run the calculator.\n        Setup the dependencies from the state dictionary\n        Check if the statistics are already computed and load them or\n        recompute them\n        Save the statistics in the correct folder\n\n        state:\n            dictionary containing the state of the calculator\n        \"\"\"\n        self._setup_deps(state)\n        if self.force_recompute or not self.attempt_load():\n            self.result = self.compute()\n            self.save_statistics()\n\n    def __str__(self) -&gt; str:\n        return self.__class__.__name__.lower()\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.root","title":"<code>root</code>  <code>property</code>","text":"<p>Path to the dataset folder</p>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.__init__","title":"<code>__init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, 
forces=None)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>name</code> <p>Name of the dataset for saving and loading.</p> required <code>energy_type</code> <p>Type of the energy for the computation of the statistics. Used for loading and saving.</p> <code>None</code> <code>force_recompute</code> <p>Flag to force the recomputation of the statistics</p> <code>False</code> <code>energies</code> <p>n Energies of the dataset</p> <code>None</code> <code>n_atoms</code> <p>Number of atoms in the dataset</p> <code>None</code> <code>atom_species</code> <p>Atomic species of the dataset</p> <code>None</code> <code>position_idx_range</code> <p>n Position index range of the dataset</p> <code>None</code> <code>e0_matrix</code> <p>Isolated atom energies matrix of the dataset</p> <code>None</code> <code>atom_charges</code> <p>Atomic charges of the dataset</p> <code>None</code> <code>forces</code> <p>Forces of the dataset</p> <code>None</code> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def __init__(\n    self,\n    name: str,\n    energy_type: Optional[str] = None,\n    force_recompute: bool = False,\n    energies: Optional[np.ndarray] = None,\n    n_atoms: Optional[np.ndarray] = None,\n    atom_species: Optional[np.ndarray] = None,\n    position_idx_range: Optional[np.ndarray] = None,\n    e0_matrix: Optional[np.ndarray] = None,\n    atom_charges: Optional[np.ndarray] = None,\n    forces: Optional[np.ndarray] = None,\n):\n    \"\"\"\n    Parameters:\n        name :\n            Name of the dataset for saving and loading.\n        energy_type :\n            Type of the energy for the computation of the statistics. 
Used for loading and saving.\n        force_recompute :\n            Flag to force the recomputation of the statistics\n        energies : n\n            Energies of the dataset\n        n_atoms :\n            Number of atoms in the dataset\n        atom_species :\n            Atomic species of the dataset\n        position_idx_range : n\n            Position index range of the dataset\n        e0_matrix :\n            Isolated atom energies matrix of the dataset\n        atom_charges :\n            Atomic charges of the dataset\n        forces :\n            Forces of the dataset\n    \"\"\"\n    self.name = name\n    self.energy_type = energy_type\n    self.force_recompute = force_recompute\n    self.energies = energies\n    self.forces = forces\n    self.position_idx_range = position_idx_range\n    self.e0_matrix = e0_matrix\n    self.n_atoms = n_atoms\n    self.atom_species_charges_tuple = (atom_species, atom_charges)\n    self._root = p_join(get_local_cache(), self.name)\n    if atom_species is not None and atom_charges is not None:\n        # by value not reference\n        self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.attempt_load","title":"<code>attempt_load()</code>","text":"<p>Load precomputed statistics file and return the success of the operation</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def attempt_load(self) -&gt; bool:\n    \"\"\"\n    Load precomputed statistics file and return the success of the operation\n    \"\"\"\n    try:\n        self.result = load_pkl(self.preprocess_path)\n        logger.info(f\"Statistics for {str(self)} loaded successfully\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n        return False\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.compute","title":"<code>compute()</code>  <code>abstractmethod</code>","text":"<p>Abstract method to compute the statistics. Must return a StatisticsResults object and be implemented in all the childs</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@abstractmethod\ndef compute(self) -&gt; StatisticsResults:\n    \"\"\"\n    Abstract method to compute the statistics.\n    Must return a StatisticsResults object and be implemented\n    in all the childs\n    \"\"\"\n    raise NotImplementedError\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.from_openqdc_dataset","title":"<code>from_openqdc_dataset(dataset, recompute=False)</code>  <code>classmethod</code>","text":"<p>Create a calculator object from a dataset object.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@classmethod\ndef from_openqdc_dataset(cls, dataset, recompute: bool = False):\n    \"\"\"\n    Create a calculator object from a dataset object.\n    \"\"\"\n    obj = cls(\n        name=dataset.__name__,\n        force_recompute=recompute,\n        energy_type=dataset.energy_type,\n        energies=dataset.data[\"energies\"],\n        forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n        n_atoms=dataset.data[\"n_atoms\"],\n        position_idx_range=dataset.data[\"position_idx_range\"],\n        atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n        atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n        e0_matrix=dataset.__isolated_atom_energies__,\n    )\n    obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n    return obj\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.run","title":"<code>run(state)</code>","text":"<p>Main 
method to run the calculator. Setup the dependencies from the state dictionary Check if the statistics are already computed and load them or recompute them Save the statistics in the correct folder</p> state <p>dictionary containing the state of the calculator</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def run(self, state: Dict) -&gt; None:\n    \"\"\"\n    Main method to run the calculator.\n    Setup the dependencies from the state dictionary\n    Check if the statistics are already computed and load them or\n    recompute them\n    Save the statistics in the correct folder\n\n    state:\n        dictionary containing the state of the calculator\n    \"\"\"\n    self._setup_deps(state)\n    if self.force_recompute or not self.attempt_load():\n        self.result = self.compute()\n        self.save_statistics()\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.save_statistics","title":"<code>save_statistics()</code>","text":"<p>Save statistics file to the dataset folder as a pkl file</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def save_statistics(self) -&gt; None:\n    \"\"\"\n    Save statistics file to the dataset folder as a pkl file\n    \"\"\"\n    save_pkl(self.result, self.preprocess_path)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.write_state","title":"<code>write_state(update)</code>","text":"<p>Write/update the state dictionary with the update dictionary</p> update <p>dictionary containing the update to the state</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def write_state(self, update: Dict) -&gt; None:\n    \"\"\"\n    Write/update the state dictionary with the update dictionary\n\n    update:\n        dictionary containing the update to the state\n    \"\"\"\n    
self.state.update(update)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.EnergyStatistics","title":"<code>EnergyStatistics</code>  <code>dataclass</code>","text":"<p>               Bases: <code>StatisticsResults</code></p> <p>Dataclass for energy related statistics</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@dataclass\nclass EnergyStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for energy related statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.ForceStatistics","title":"<code>ForceStatistics</code>  <code>dataclass</code>","text":"<p>               Bases: <code>StatisticsResults</code></p> <p>Dataclass for force statistics</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>@dataclass\nclass ForceStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for force statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n    component_mean: Optional[np.ndarray]\n    component_std: Optional[np.ndarray]\n    component_rms: Optional[np.ndarray]\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.ForcesCalculatorStats","title":"<code>ForcesCalculatorStats</code>","text":"<p>               Bases: <code>AbstractStatsCalculator</code></p> <p>Forces statistics calculator class</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class ForcesCalculatorStats(AbstractStatsCalculator):\n    \"\"\"\n    Forces statistics calculator class\n    \"\"\"\n\n    def compute(self) -&gt; ForceStatistics:\n        if not self.has_forces:\n            return ForceStatistics(mean=None, std=None, component_mean=None, component_std=None, component_rms=None)\n        converted_force_data = self.forces\n        num_methods = converted_force_data.shape[2]\n        mean = 
np.nanmean(converted_force_data.reshape(-1, num_methods), axis=0)\n        std = np.nanstd(converted_force_data.reshape(-1, num_methods), axis=0)\n        component_mean = np.nanmean(converted_force_data, axis=0)\n        component_std = np.nanstd(converted_force_data, axis=0)\n        component_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))\n        return ForceStatistics(\n            mean=np.atleast_2d(mean),\n            std=np.atleast_2d(std),\n            component_mean=np.atleast_2d(component_mean),\n            component_std=np.atleast_2d(component_std),\n            component_rms=np.atleast_2d(component_rms),\n        )\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyInterface","title":"<code>FormationEnergyInterface</code>","text":"<p>               Bases: <code>AbstractStatsCalculator</code>, <code>ABC</code></p> <p>Formation Energy interface calculator class. Define the use of the dependency formation_energy in the compute method</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class FormationEnergyInterface(AbstractStatsCalculator, ABC):\n    \"\"\"\n    Formation Energy interface calculator class.\n    Define the use of the dependency formation_energy in the\n    compute method\n    \"\"\"\n\n    state_dependency = [\"formation_energy\"]\n\n    def compute(self) -&gt; EnergyStatistics:\n        # if the state has not the dependency satisfied\n        if not self.deps_satisfied:\n            # run the main computation\n            from openqdc.utils.constants import MAX_CHARGE\n\n            splits_idx = self.position_idx_range[:, 1]\n            s = np.array(self.atom_species_charges_tuple, dtype=int)\n            s[:, 1] += MAX_CHARGE\n            matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.e0_matrix]\n            converted_energy_data = self.energies\n            E = []\n            for i, matrix in enumerate(matrixs):\n                c = 
np.cumsum(np.append([0], matrix))[splits_idx]\n                c[1:] = c[1:] - c[:-1]\n                E.append(converted_energy_data[:, i] - c)\n        else:\n            # if the dependency is satisfied get the dependency\n            E = getattr(self, self.state_dependency[0])\n        self.write_state({self.state_dependency[0]: E})\n        E = np.array(E).T\n        return self._compute(E)\n\n    @abstractmethod\n    def _compute(self, energy) -&gt; EnergyStatistics:\n        raise NotImplementedError\n\n    def __str__(self) -&gt; str:\n        # override the __str__ method to add the energy type to the name\n        # to differentiate between formation and regression type\n        return f\"{self.__class__.__name__.lower()}_{self.energy_type.lower()}\"\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyStats","title":"<code>FormationEnergyStats</code>","text":"<p>               Bases: <code>FormationEnergyInterface</code></p> <p>Formation Energy  calculator class.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class FormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -&gt; EnergyStatistics:\n        formation_E_mean = np.nanmean(energy, axis=0)\n        formation_E_std = np.nanstd(energy, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(formation_E_mean), std=np.atleast_2d(formation_E_std))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.PerAtomFormationEnergyStats","title":"<code>PerAtomFormationEnergyStats</code>","text":"<p>               Bases: <code>FormationEnergyInterface</code></p> <p>Per atom Formation Energy  calculator class.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class PerAtomFormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Per atom Formation Energy  calculator class.\n    \"\"\"\n\n    def 
_compute(self, energy) -&gt; EnergyStatistics:\n        inter_E_mean = np.nanmean((energy / self.n_atoms[:, None]), axis=0)\n        inter_E_std = np.nanstd((energy / self.n_atoms[:, None]), axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(inter_E_mean), std=np.atleast_2d(inter_E_std))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager","title":"<code>StatisticManager</code>","text":"<p>Manager class that automatically handle the shared state between the statistic calculators</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class StatisticManager:\n    \"\"\"\n    Manager class that automatically handle the shared state between\n    the statistic calculators\n    \"\"\"\n\n    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n        \"\"\"\n        Parameters:\n            dataset : openqdc.datasets.base.BaseDataset\n                The dataset object to compute the statistics\n            recompute:\n                Flag to recompute the statistics\n            *statistic_calculators:\n                List of statistic calculators to run\n        \"\"\"\n        self._state = {}\n        self._results = {}\n        self._statistic_calculators = [\n            statistic_calculators.from_openqdc_dataset(dataset, recompute)\n            for statistic_calculators in statistic_calculators\n        ]\n\n    @property\n    def state(self) -&gt; Dict:\n        \"\"\"\n        Return the dictionary state of the manager\n\n        Returns:\n            State of the StatisticManager\n        \"\"\"\n        return self._state\n\n    def reset_state(self):\n        \"\"\"\n        Reset the state dictionary\n        \"\"\"\n        self._state = {}\n\n    def reset_results(self):\n        \"\"\"\n        Reset the results dictionary\n        \"\"\"\n        self._results = {}\n\n    def get_state(self, key: Optional[str] = None) -&gt; 
Optional[Any]:\n        \"\"\"\n        Return the value of the key in the state dictionary\n\n        Parameters:\n            key: str, default = None\n        Returns:\n            the value of the key in the state dictionary\n            or the whole state dictionary if key is None\n        \"\"\"\n        if key is None:\n            return self._state\n        return self._state.get(key, None)\n\n    def has_state(self, key: str) -&gt; bool:\n        \"\"\"\n        Check is state has key\n\n        Parameters:\n            key:\n                Key to check in the state dictionary\n\n        Returns:\n            True if the key is in the state dictionary\n        \"\"\"\n        return key in self._state\n\n    def get_results(self, as_dict: bool = False):\n        \"\"\"\n        Aggregate results from all the calculators\n\n        Parameters:\n            as_dict:\n                Flag to return the results as a dictionary\n        \"\"\"\n        results = deepcopy(self._results)\n        if as_dict:\n            return {k: v.as_dict() for k, v in results.items()}\n        return {k: v for k, v in self._results.items()}\n\n    def run_calculators(self):\n        \"\"\"\n        Run the saved calculators and save the results in the manager\n        \"\"\"\n        logger.info(\"Processing dataset statistics\")\n        for calculator in self._statistic_calculators:\n            calculator.run(self.state)\n            self._results[calculator.__class__.__name__] = calculator.result\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.state","title":"<code>state: Dict</code>  <code>property</code>","text":"<p>Return the dictionary state of the manager</p> <p>Returns:</p> Type Description <code>Dict</code> <p>State of the StatisticManager</p>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.__init__","title":"<code>__init__(dataset, recompute=False, 
*statistic_calculators)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>dataset</code> <p>openqdc.datasets.base.BaseDataset The dataset object to compute the statistics</p> required <code>recompute</code> <code>bool</code> <p>Flag to recompute the statistics</p> <code>False</code> <code>*statistic_calculators</code> <code>AbstractStatsCalculator</code> <p>List of statistic calculators to run</p> <code>()</code> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n    \"\"\"\n    Parameters:\n        dataset : openqdc.datasets.base.BaseDataset\n            The dataset object to compute the statistics\n        recompute:\n            Flag to recompute the statistics\n        *statistic_calculators:\n            List of statistic calculators to run\n    \"\"\"\n    self._state = {}\n    self._results = {}\n    self._statistic_calculators = [\n        statistic_calculators.from_openqdc_dataset(dataset, recompute)\n        for statistic_calculators in statistic_calculators\n    ]\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_results","title":"<code>get_results(as_dict=False)</code>","text":"<p>Aggregate results from all the calculators</p> <p>Parameters:</p> Name Type Description Default <code>as_dict</code> <code>bool</code> <p>Flag to return the results as a dictionary</p> <code>False</code> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def get_results(self, as_dict: bool = False):\n    \"\"\"\n    Aggregate results from all the calculators\n\n    Parameters:\n        as_dict:\n            Flag to return the results as a dictionary\n    \"\"\"\n    results = deepcopy(self._results)\n    if as_dict:\n        return {k: v.as_dict() for k, v in results.items()}\n    return {k: v for k, v in 
self._results.items()}\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_state","title":"<code>get_state(key=None)</code>","text":"<p>Return the value of the key in the state dictionary</p> <p>Parameters:</p> Name Type Description Default <code>key</code> <code>Optional[str]</code> <p>str, default = None</p> <code>None</code> <p>Returns:     the value of the key in the state dictionary     or the whole state dictionary if key is None</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def get_state(self, key: Optional[str] = None) -&gt; Optional[Any]:\n    \"\"\"\n    Return the value of the key in the state dictionary\n\n    Parameters:\n        key: str, default = None\n    Returns:\n        the value of the key in the state dictionary\n        or the whole state dictionary if key is None\n    \"\"\"\n    if key is None:\n        return self._state\n    return self._state.get(key, None)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.has_state","title":"<code>has_state(key)</code>","text":"<p>Check if state has key</p> <p>Parameters:</p> Name Type Description Default <code>key</code> <code>str</code> <p>Key to check in the state dictionary</p> required <p>Returns:</p> Type Description <code>bool</code> <p>True if the key is in the state dictionary</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def has_state(self, key: str) -&gt; bool:\n    \"\"\"\n    Check is state has key\n\n    Parameters:\n        key:\n            Key to check in the state dictionary\n\n    Returns:\n        True if the key is in the state dictionary\n    \"\"\"\n    return key in self._state\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_results","title":"<code>reset_results()</code>","text":"<p>Reset the results dictionary</p> Source code in <code>openqdc/datasets/statistics.py</code> 
<pre><code>def reset_results(self):\n    \"\"\"\n    Reset the results dictionary\n    \"\"\"\n    self._results = {}\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_state","title":"<code>reset_state()</code>","text":"<p>Reset the state dictionary</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def reset_state(self):\n    \"\"\"\n    Reset the state dictionary\n    \"\"\"\n    self._state = {}\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.run_calculators","title":"<code>run_calculators()</code>","text":"<p>Run the saved calculators and save the results in the manager</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def run_calculators(self):\n    \"\"\"\n    Run the saved calculators and save the results in the manager\n    \"\"\"\n    logger.info(\"Processing dataset statistics\")\n    for calculator in self._statistic_calculators:\n        calculator.run(self.state)\n        self._results[calculator.__class__.__name__] = calculator.result\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults","title":"<code>StatisticsResults</code>","text":"<p>Parent class to statistics results to provide general methods.</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class StatisticsResults:\n    \"\"\"\n    Parent class to statistics results\n    to provide general methods.\n    \"\"\"\n\n    def to_dict(self) -&gt; Dict:\n        \"\"\"\n        Convert the class to a dictionary\n\n        Returns:\n            Dictionary representation of the class\n        \"\"\"\n        return asdict(self)\n\n    def transform(self, func: Callable):\n        \"\"\"\n        Apply a function to all the attributes of the class\n\n        Parameters:\n            func:\n                Function to apply to the attributes\n        \"\"\"\n        for k, v in 
self.to_dict().items():\n            if v is not None:\n                setattr(self, k, func(v))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.to_dict","title":"<code>to_dict()</code>","text":"<p>Convert the class to a dictionary</p> <p>Returns:</p> Type Description <code>Dict</code> <p>Dictionary representation of the class</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def to_dict(self) -&gt; Dict:\n    \"\"\"\n    Convert the class to a dictionary\n\n    Returns:\n        Dictionary representation of the class\n    \"\"\"\n    return asdict(self)\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.transform","title":"<code>transform(func)</code>","text":"<p>Apply a function to all the attributes of the class</p> <p>Parameters:</p> Name Type Description Default <code>func</code> <code>Callable</code> <p>Function to apply to the attributes</p> required Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>def transform(self, func: Callable):\n    \"\"\"\n    Apply a function to all the attributes of the class\n\n    Parameters:\n        func:\n            Function to apply to the attributes\n    \"\"\"\n    for k, v in self.to_dict().items():\n        if v is not None:\n            setattr(self, k, func(v))\n</code></pre>"},{"location":"API/statistics.html#openqdc.datasets.statistics.TotalEnergyStats","title":"<code>TotalEnergyStats</code>","text":"<p>               Bases: <code>AbstractStatsCalculator</code></p> <p>Total Energy statistics calculator class</p> Source code in <code>openqdc/datasets/statistics.py</code> <pre><code>class TotalEnergyStats(AbstractStatsCalculator):\n    \"\"\"\n    Total Energy statistics calculator class\n    \"\"\"\n\n    def compute(self) -&gt; EnergyStatistics:\n        converted_energy_data = self.energies\n        total_E_mean = np.nanmean(converted_energy_data, axis=0)\n        total_E_std = 
np.nanstd(converted_energy_data, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(total_E_mean), std=np.atleast_2d(total_E_std))\n</code></pre>"},{"location":"API/units.html","title":"UNITS","text":"<p>Units conversion utilities module.</p> Available Energy units <p>[\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\", \"mev\", \"ryd\"]</p> Available Distance units <p>[\"ang\", \"nm\", \"bohr\"]</p> Available Force units <p>Combinations between Energy and Distance units</p>"},{"location":"API/units.html#openqdc.utils.units.Conversion","title":"<code>Conversion</code>","text":"<p>Conversion from one unit system to another defined by a name and a callable</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>class Conversion:\n    \"\"\"\n    Conversion from one unit system to another defined by a name and a callable\n    \"\"\"\n\n    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n        \"\"\"\n\n        Parameters:\n            in_unit: String defining the units of the current values\n            out_unit: String defining the target units\n            func: The callable to compute the conversion\n        \"\"\"\n        name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n        if name in CONVERSION_REGISTRY:\n            raise ConversionAlreadyDefined(in_unit, out_unit)\n        CONVERSION_REGISTRY[name] = self\n\n        self.name = name\n        self.fn = func\n\n    def __call__(self, x):\n        return self.fn(x)\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.Conversion.__init__","title":"<code>__init__(in_unit, out_unit, func)</code>","text":"<p>Parameters:</p> Name Type Description Default <code>in_unit</code> <code>str</code> <p>String defining the units of the current values</p> required <code>out_unit</code> <code>str</code> <p>String defining the target units</p> required <code>func</code> <code>Callable[[float], float]</code> <p>The callable to 
compute the conversion</p> required Source code in <code>openqdc/utils/units.py</code> <pre><code>def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n    \"\"\"\n\n    Parameters:\n        in_unit: String defining the units of the current values\n        out_unit: String defining the target units\n        func: The callable to compute the conversion\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n    if name in CONVERSION_REGISTRY:\n        raise ConversionAlreadyDefined(in_unit, out_unit)\n    CONVERSION_REGISTRY[name] = self\n\n    self.name = name\n    self.fn = func\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion","title":"<code>DistanceTypeConversion</code>","text":"<p>               Bases: <code>ConversionEnum</code>, <code>StrEnum</code></p> <p>Define the possible distance units for conversion</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>@unique\nclass DistanceTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible distance units for conversion\n    \"\"\"\n\n    ANG = \"ang\"\n    NM = \"nm\"\n    BOHR = \"bohr\"\n\n    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -&gt; Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the distance to the desired units.\n\n        Parameters:\n            distance: distance unit to convert to\n            fraction: whether it is distance^1 or distance^-1\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion.to","title":"<code>to(distance, fraction=False)</code>","text":"<p>Get the conversion function to convert the distance to the desired 
units.</p> <p>Parameters:</p> Name Type Description Default <code>distance</code> <code>DistanceTypeConversion</code> <p>distance unit to convert to</p> required <code>fraction</code> <code>bool</code> <p>whether it is distance^1 or distance^-1</p> <code>False</code> <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>callable to convert the distance to the desired units</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -&gt; Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the distance to the desired units.\n\n    Parameters:\n        distance: distance unit to convert to\n        fraction: whether it is distance^1 or distance^-1\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion","title":"<code>EnergyTypeConversion</code>","text":"<p>               Bases: <code>ConversionEnum</code>, <code>StrEnum</code></p> <p>Define the possible energy units for conversion</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>@unique\nclass EnergyTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible energy units for conversion\n    \"\"\"\n\n    KCAL_MOL = \"kcal/mol\"\n    KJ_MOL = \"kj/mol\"\n    HARTREE = \"hartree\"\n    EV = \"ev\"\n    MEV = \"mev\"\n    RYD = \"ryd\"\n\n    def to(self, energy: \"EnergyTypeConversion\") -&gt; Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the energy to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n\n        Returns:\n            Callable to convert the distance to the desired units\n        \"\"\"\n        return 
get_conversion(str(self), str(energy))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion.to","title":"<code>to(energy)</code>","text":"<p>Get the conversion function to convert the energy to the desired units.</p> <p>Parameters:</p> Name Type Description Default <code>energy</code> <code>EnergyTypeConversion</code> <p>energy unit to convert to</p> required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>Callable to convert the energy to the desired units</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def to(self, energy: \"EnergyTypeConversion\") -&gt; Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the energy to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n\n    Returns:\n        Callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(energy))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion","title":"<code>ForceTypeConversion</code>","text":"<p>               Bases: <code>ConversionEnum</code></p> <p>Define the possible force units for conversion</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>@unique\nclass ForceTypeConversion(ConversionEnum):\n    \"\"\"\n    Define the possible foce units for conversion\n    \"\"\"\n\n    #     Name      = EnergyTypeConversion,         , DistanceTypeConversion\n    HARTREE_BOHR = EnergyTypeConversion.HARTREE, DistanceTypeConversion.BOHR\n    HARTREE_ANG = EnergyTypeConversion.HARTREE, DistanceTypeConversion.ANG\n    HARTREE_NM = EnergyTypeConversion.HARTREE, DistanceTypeConversion.NM\n    EV_BOHR = EnergyTypeConversion.EV, DistanceTypeConversion.BOHR\n    EV_ANG = EnergyTypeConversion.EV, DistanceTypeConversion.ANG\n    EV_NM = EnergyTypeConversion.EV, DistanceTypeConversion.NM\n    KCAL_MOL_BOHR = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.BOHR\n    
KCAL_MOL_ANG = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.ANG\n    KCAL_MOL_NM = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.NM\n    KJ_MOL_BOHR = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.BOHR\n    KJ_MOL_ANG = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.ANG\n    KJ_MOL_NM = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.NM\n    MEV_BOHR = EnergyTypeConversion.MEV, DistanceTypeConversion.BOHR\n    MEV_ANG = EnergyTypeConversion.MEV, DistanceTypeConversion.ANG\n    MEV_NM = EnergyTypeConversion.MEV, DistanceTypeConversion.NM\n    RYD_BOHR = EnergyTypeConversion.RYD, DistanceTypeConversion.BOHR\n    RYD_ANG = EnergyTypeConversion.RYD, DistanceTypeConversion.ANG\n    RYD_NM = EnergyTypeConversion.RYD, DistanceTypeConversion.NM\n\n    def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):\n        self.energy = energy\n        self.distance = distance\n\n    def __str__(self):\n        return f\"{self.energy}/{self.distance}\"\n\n    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -&gt; Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the force to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n            distance: distance unit to convert to\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion.to","title":"<code>to(energy, distance)</code>","text":"<p>Get the conversion function to convert the force to the desired units.</p> <p>Parameters:</p> Name Type Description Default <code>energy</code> <code>EnergyTypeConversion</code> <p>energy unit to convert to</p> required <code>distance</code> <code>DistanceTypeConversion</code> <p>distance unit to convert to</p> 
required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>callable to convert the force to the desired units</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -&gt; Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the force to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n        distance: distance unit to convert to\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n</code></pre>"},{"location":"API/units.html#openqdc.utils.units.get_conversion","title":"<code>get_conversion(in_unit, out_unit)</code>","text":"<p>Utility function to get the conversion function between two units.</p> <p>Parameters:</p> Name Type Description Default <code>in_unit</code> <p>The input unit</p> required <code>out_unit</code> <p>The output unit</p> required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>The conversion function</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def get_conversion(in_unit: str, out_unit: str) -&gt; Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return 
CONVERSION_REGISTRY[name]\n</code></pre>"},{"location":"API/utils.html","title":"Utils","text":""},{"location":"API/utils.html#openqdc.utils.check_file","title":"<code>check_file(path)</code>","text":"<p>Checks if file present on local</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def check_file(path) -&gt; bool:\n    \"\"\"Checks if file present on local\"\"\"\n    return os.path.exists(path)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.create_hdf5_file","title":"<code>create_hdf5_file(hdf5_file_path)</code>","text":"<p>Creates hdf5 file with fsspec</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def create_hdf5_file(hdf5_file_path: str):\n    \"\"\"Creates hdf5 file with fsspec\"\"\"\n    fp = fsspec.open(hdf5_file_path, \"wb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    return h5py.File(fp, \"a\")\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.get_conversion","title":"<code>get_conversion(in_unit, out_unit)</code>","text":"<p>Utility function to get the conversion function between two units.</p> <p>Parameters:</p> Name Type Description Default <code>in_unit</code> <p>The input unit</p> required <code>out_unit</code> <p>The output unit</p> required <p>Returns:</p> Type Description <code>Callable[[float], float]</code> <p>The conversion function</p> Source code in <code>openqdc/utils/units.py</code> <pre><code>def get_conversion(in_unit: str, out_unit: str) -&gt; Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, 
out_unit)\n    return CONVERSION_REGISTRY[name]\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.get_local_cache","title":"<code>get_local_cache()</code>","text":"<p>Returns the local cache directory. It creates it if it does not exist.</p> <p>Returns:</p> Name Type Description <code>str</code> <code>str</code> <p>path to the local cache directory</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def get_local_cache() -&gt; str:\n    \"\"\"\n    Returns the local cache directory. It creates it if it does not exist.\n\n    Returns:\n        str: path to the local cache directory\n    \"\"\"\n    cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR))\n    os.makedirs(cache_dir, exist_ok=True)\n    return cache_dir\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.get_remote_cache","title":"<code>get_remote_cache(write_access=False)</code>","text":"<p>Returns the entry point based on the write access.</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def get_remote_cache(write_access=False) -&gt; str:\n    \"\"\"\n    Returns the entry point based on the write access.\n    \"\"\"\n    if write_access:\n        remote_cache = \"openqdc/v1\"  # \"gs://qmdata-public/openqdc\"\n        # remote_cache = \"gs://qmdata-public/openqdc\"\n    else:\n        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get(\"OPENQDC_DOWNLOAD_API\", \"s3\"))\n        # remote_cache = \"https://storage.googleapis.com/qmdata-public/openqdc\"\n    return remote_cache\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.load_hdf5_file","title":"<code>load_hdf5_file(hdf5_file_path)</code>","text":"<p>Loads hdf5 file with fsspec</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def load_hdf5_file(hdf5_file_path: str):\n    \"\"\"Loads hdf5 file with fsspec\"\"\"\n    if not check_file(hdf5_file_path):\n        raise FileNotFoundError(f\"File {hdf5_file_path} does not exist on GCS and local.\")\n\n    fp = 
fsspec.open(hdf5_file_path, \"rb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    file = h5py.File(fp)\n\n    # inorder to enable multiprocessing:\n    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801\n    # fsspec.asyn.iothread[0] = None\n    # fsspec.asyn.loop[0] = None\n\n    return file\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.load_json","title":"<code>load_json(path)</code>","text":"<p>Loads json file</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def load_json(path):\n    \"\"\"Loads json file\"\"\"\n    with fsspec.open(path, \"r\") as fp:  # Unpickling\n        return json.load(fp)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.load_pkl","title":"<code>load_pkl(path, check=True)</code>","text":"<p>Load pkl file</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def load_pkl(path, check=True):\n    \"\"\"Load pkl file\"\"\"\n    if check:\n        if not check_file(path):\n            raise FileNotFoundError(f\"File {path} does not exist on GCS and local.\")\n\n    with open(path, \"rb\") as fp:  # Unpickling\n        return pkl.load(fp)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.makedirs","title":"<code>makedirs(path, exist_ok=True)</code>","text":"<p>Creates directory</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def makedirs(path, exist_ok=True):\n    \"\"\"Creates directory\"\"\"\n    os.makedirs(path, exist_ok=exist_ok)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.read_qc_archive_h5","title":"<code>read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)</code>","text":"<p>Extracts data from the HDF5 archive file.</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def read_qc_archive_h5(\n    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None\n) -&gt; List[Dict[str, np.ndarray]]:\n    \"\"\"Extracts data from the HDF5 archive 
file.\"\"\"\n    data = load_hdf5_file(raw_path)\n    data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}\n\n    n = len(data_t[\"molecule_id\"])\n    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]\n    return samples\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.save_pkl","title":"<code>save_pkl(file, path)</code>","text":"<p>Saves pkl file</p> Source code in <code>openqdc/utils/io.py</code> <pre><code>def save_pkl(file, path):\n    \"\"\"Saves pkl file\"\"\"\n    logger.info(f\"Saving file at {path}\")\n    with fsspec.open(path, \"wb\") as fp:  # Pickling\n        pkl.dump(file, fp)\n</code></pre>"},{"location":"API/utils.html#openqdc.utils.set_cache_dir","title":"<code>set_cache_dir(d)</code>","text":"<p>Optionally set the _OPENQDC_CACHE_DIR directory.</p> <p>Parameters:</p> Name Type Description Default <code>d</code> <code>str</code> <p>path to a local folder.</p> required Source code in <code>openqdc/utils/io.py</code> <pre><code>def set_cache_dir(d):\n    r\"\"\"\n    Optionally set the _OPENQDC_CACHE_DIR directory.\n\n    Args:\n        d (str): path to a local folder.\n    \"\"\"\n    if d is None:\n        return\n    global _OPENQDC_CACHE_DIR\n    _OPENQDC_CACHE_DIR = os.path.normpath(os.path.expanduser(d))\n</code></pre>"},{"location":"API/datasets/3bpa.html","title":"3BPA","text":""},{"location":"API/datasets/3bpa.html#openqdc.datasets.potential.bpa.BPA","title":"<code>BPA</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike molecule 3-(benzyloxy)pyridin-2-amine. This dataset features complex dihedral potential energy surface with many local minima, which can be challenging to approximate using classical or ML force fields. 
The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to perturb the toward lower potential energies. Furthermore, long 25 ps MD simulation were performed at three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. The final configurations were re-evaluated using ORCA at the DFT level of theory using the \u03c9B97X exchange correlation functional and the 6-31G(d) basis set.</p> <p>Usage: <pre><code>from openqdc.datasets import BPA\ndataset = BPA()\n</code></pre></p> References <p>https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647</p> Source code in <code>openqdc/datasets/potential/bpa.py</code> <pre><code>class BPA(BaseDataset):\n    \"\"\"\n    BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike\n    molecule 3-(benzyloxy)pyridin-2-amine. This dataset features\n    complex dihedral potential energy surface with many local minima,\n    which can be challenging to approximate using classical or ML force fields.\n    The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to\n    perturb the toward lower potential energies. 
Furthermore, long 25 ps MD simulation were performed at\n    three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.\n    The final configurations were re-evaluated using ORCA at the DFT level of\n    theory using the \u03c9B97X exchange correlation functional and the 6-31G(d) basis set.\n\n    Usage:\n    ```python\n    from openqdc.datasets import BPA\n    dataset = BPA()\n    ```\n\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647\n    \"\"\"\n\n    __name__ = \"BPA\"\n    __energy_unit__ = \"ev\"\n    __forces_unit__ = \"ev/ang\"\n    __distance_unit__ = \"ang\"\n    __force_mask__ = [True]\n    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]\n    __links__ = {\"BPA.zip\": \"https://figshare.com/ndownloader/files/31325990\"}\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        import os.path as osp\n        from glob import glob\n\n        from ase.io import iread\n\n        files = glob(osp.join(self.root, \"dataset_3BPA\", \"*.xyz\"))\n        files = [f for f in files if \"iso_atoms.xyz\" not in f]\n        all_records = []\n\n        for file in files:\n            subset = np.array([osp.basename(file).split(\".\")[0]])\n\n            for atoms in iread(file, format=\"extxyz\"):\n                all_records.append(read_bpa_record(subset, atoms))\n\n        return all_records\n\n    def __getitem__(self, idx):\n        data = super().__getitem__(idx)\n        data.__setattr__(\"split\", self._convert_array(self.data[\"split\"][idx]))\n        return data\n</code></pre>"},{"location":"API/datasets/alchemy.html","title":"Alchemy","text":""},{"location":"API/datasets/alchemy.html#openqdc.datasets.potential.alchemy.Alchemy","title":"<code>Alchemy</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. 
Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange matrix.</p> <p>Usage: <pre><code>from openqdc.datasets import Alchemy\ndataset = Alchemy()\n</code></pre></p> Reference <p>https://arxiv.org/abs/1906.09427 https://alchemy.tencent.com/</p> Source code in <code>openqdc/datasets/potential/alchemy.py</code> <pre><code>class Alchemy(BaseDataset):\n    \"\"\"\n    Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.\n    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level\n    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used\n    to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G\n    is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the\n    B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. 
The\n    auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange\n    matrix.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Alchemy\n    dataset = Alchemy()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/1906.09427\n        https://alchemy.tencent.com/\n    \"\"\"\n\n    __name__ = \"alchemy\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\"alchemy.zip\": \"https://alchemy.tencent.com/data/alchemy-v20191129.zip\"}\n\n    def read_raw_entries(self):\n        dir_path = p_join(self.root, \"Alchemy-v20191129\")\n        full_csv = pd.read_csv(p_join(dir_path, \"final_version.csv\"))\n        energies = full_csv[\"U0\\n(Ha, internal energy at 0 K)\"].tolist()\n        atom_folder = full_csv[\"atom number\"]\n        gdb_idx = full_csv[\"gdb_idx\"]\n        idxs = full_csv.index.tolist()\n        samples = []\n        for i in tqdm(idxs):\n            sdf_file = p_join(dir_path, f\"atom_{atom_folder[i]}\", f\"{gdb_idx[i]}.sdf\")\n            energy = energies[i]\n            samples.append(read_mol(sdf_file, energy))\n        return samples\n</code></pre>"},{"location":"API/datasets/ani.html","title":"ANI","text":""},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1","title":"<code>ANI1</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. 
For generating structures, smiles strings for molecules are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT level.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1\ndataset = ANI1()\n</code></pre></p> References <p>https://www.nature.com/articles/sdata2017193</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1(BaseDataset):\n    \"\"\"\n    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic\n    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the\n    wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules\n    are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary\n    point using the MMFF94 force field. 
Finally, geometries are optimized until energy minima using the chosen DFT\n    level.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1\n    dataset = ANI1()\n    ```\n\n    References:\n        https://www.nature.com/articles/sdata2017193\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"ani1.hdf5.gz\": \"https://zenodo.org/record/3585840/files/214.hdf5.gz\"}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"ani\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"ani\", links=self.__links__)\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, self.force_target_names)\n        return samples\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX","title":"<code>ANI1CCX</code>","text":"<p>               Bases: <code>ANI1</code></p> <p>ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active learning. 
The conformations are labelled using a high accuracy CCSD(T)*/CBS method.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1CCX\ndataset = ANI1CCX()\n</code></pre></p> References <p>https://doi.org/10.1038/s41467-019-10827-4</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1CCX(ANI1):\n    \"\"\"\n    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active\n    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX\n    dataset = ANI1CCX()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_T_CBS,  # \"ccsd(t)/cbs\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVTZ,  # \"ccsd(t)/cc-pvtz\",\n        PotentialMethod.TCSSD_T_CC_PVDZ,  # \"tccsd(t)/cc-pvdz\",\n    ]\n\n    energy_target_names = [\n        \"CCSD(T)*:CBS Total Energy\",\n        \"NPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n        \"NPNO-CCSD(T):cc-pVTZ Correlation Energy\",\n        \"TPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n    ]\n    force_target_names = []\n    __links__ = {\"ani1x.hdf5.gz\": \"https://zenodo.org/record/4081694/files/292.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        return x.decode(\"ascii\")\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX_V2","title":"<code>ANI1CCX_V2</code>","text":"<p>               Bases: <code>ANI1CCX</code></p> <p>ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each 
conformation.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1CCX_V2\ndataset = ANI1CCX_V2()\n</code></pre></p> References <p>https://doi.org/10.1038/s41467-019-10827-4</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1CCX_V2(ANI1CCX):\n    \"\"\"\n    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels\n    for each conformation.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX_V2\n    dataset = ANI1CCX_V2()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx_v2\"\n\n    __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]\n    energy_target_names = ANI1CCX.energy_target_names + [\"PM6\", \"GFN2\"]\n    __force_mask__ = ANI1CCX.__force_mask__ + [False, False]\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1X","title":"<code>ANI1X</code>","text":"<p>               Bases: <code>ANI1</code></p> <p>The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, generated amino acids and 2-amino acid peptides are used for sampling new molecules. 
One of the following techniques is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI1X\ndataset = ANI1X()\n</code></pre></p> References <p>https://doi.org/10.1063/1.5023802</p> <p>https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI1X(ANI1):\n    \"\"\"\n    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to\n    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,\n    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the techniques\n    are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and\n    (4) torsion sampling.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1X\n    dataset = ANI1X()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5023802\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.HF_CC_PVDZ,\n        PotentialMethod.HF_CC_PVQZ,\n        PotentialMethod.HF_CC_PVTZ,\n        PotentialMethod.MP2_CC_PVDZ,\n        PotentialMethod.MP2_CC_PVQZ,\n        PotentialMethod.MP2_CC_PVTZ,\n        PotentialMethod.WB97X_6_31G_D,\n        PotentialMethod.WB97X_CC_PVTZ,\n    ]\n\n    energy_target_names = [\n        \"HF:cc-pVDZ Total Energy\",\n        \"HF:cc-pVQZ Total Energy\",\n        \"HF:cc-pVTZ Total Energy\",\n        \"MP2:cc-pVDZ Correlation Energy\",\n        \"MP2:cc-pVQZ Correlation Energy\",\n        \"MP2:cc-pVTZ Correlation Energy\",\n        \"wB97x:6-31G(d) Total Energy\",\n        \"wB97x:def2-TZVPP Total 
Energy\",\n    ]\n\n    force_target_names = [\n        \"wB97x:6-31G(d) Atomic Forces\",\n        \"wB97x:def2-TZVPP Atomic Forces\",\n    ]\n\n    __force_mask__ = [False, False, False, False, False, False, True, True]\n    __links__ = {\"ani1ccx.hdf5.gz\": \"https://zenodo.org/record/4081692/files/293.hdf5.gz\"}\n\n    def convert_forces(self, x):\n        return super().convert_forces(x) * 0.529177249  # correct the Dataset error\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n</code></pre>"},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI2X","title":"<code>ANI2X</code>","text":"<p>               Bases: <code>ANI1</code></p> <p>The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are used for generating geometries.</p> <p>Usage: <pre><code>from openqdc.datasets import ANI2X\ndataset = ANI2X()\n</code></pre></p> References <p>https://doi.org/10.1021/acs.jctc.0c00121 https://github.com/aiqm/ANI1x_datasets</p> Source code in <code>openqdc/datasets/potential/ani.py</code> <pre><code>class ANI2X(ANI1):\n    \"\"\"\n    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.\n    It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized\n    using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. 
The same sampling techniques as done in ANI-1X are\n    used for generating geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI2X\n    dataset = ANI2X()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.0c00121\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani2x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        # PotentialMethod.NONE,  # \"b973c/def2mtzvp\",\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/631gd\", # PAPER DATASET\n        # PotentialMethod.NONE,  # \"wb97md3bj/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97mv/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97x/def2tzvpp\",\n    ]\n\n    energy_target_names = [\n        # \"b973c/def2mtzvp\",\n        \"wb97x/631gd\",\n        # \"wb97md3bj/def2tzvpp\",\n        # \"wb97mv/def2tzvpp\",\n        # \"wb97x/def2tzvpp\",\n    ]\n\n    force_target_names = [\"wb97x/631gd\"]  # \"b973c/def2mtzvp\",\n\n    __force_mask__ = [True]\n    __links__ = {  # \"ANI-2x-B973c-def2mTZVP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1\", # noqa\n        # \"ANI-2x-wB97MV-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1\", # noqa\n        \"ANI-2x-wB97X-631Gd.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97X-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1\", # noqa\n    }\n\n    def __smiles_converter__(self, x):\n        return x\n\n    def read_raw_entries(self):\n        samples = []\n        for lvl_theory in self.__links__.keys():\n          
  raw_path = p_join(self.root, \"final_h5\", f\"{lvl_theory.split('.')[0]}.h5\")\n            samples.extend(read_ani2_h5(raw_path))\n        return samples\n</code></pre>"},{"location":"API/datasets/comp6.html","title":"Comp6","text":""},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6","title":"<code>COMP6</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and molecular dipoles.</p> Details of the benchmark sets are as follows <p>S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and</p> <p>mixed influence interactions.</p> <pre><code>ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n</code></pre> <p>dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point calculations are performed to calculate energies and forces.</p> <pre><code>GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n</code></pre> <p>The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence criteria. Normal modes/force constants are computed using the reference DFT model. 
Finally, diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.</p> <pre><code>GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n</code></pre> <p>and 1000 molecules per 12 and 13 heavy atoms are subsampled from GDB-13. Non-equilibrium conformations are generated via DNMS.</p> <pre><code>Tripeptide: Consists of 248 random tripeptides. Structures are optimized similarly to GDB7to9.\n\nDrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n</code></pre> <p>Structures are optimized similarly to GDB7to9.</p> <p>Usage: <pre><code>from openqdc.datasets import COMP6\ndataset = COMP6()\n</code></pre></p> References <p>https://aip.scitation.org/doi/abs/10.1063/1.5023802</p> <p>https://github.com/isayev/COMP6</p> <p>S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d</p> <p>GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/</p> <p>GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/</p> <p>DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h</p> Source code in <code>openqdc/datasets/potential/comp6.py</code> <pre><code>class COMP6(BaseDataset):\n    \"\"\"\n    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the\n    ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and\n    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using\n    the wB97x density functional with the 6-31G(d) basis set. 
The dataset also includes Hirshfield charges and\n    molecular dipoles.\n\n    Details of the benchmark sets are as follows:\n        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and\n    mixed influence interactions.\\n\n        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small\n    proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point\n    calculations are performed to calculate energies and forces.\\n\n        GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n    The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence\n    criteria. Normal modes/force constants are computer using the reference DFT model. Finally, Diverse normal\n    mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\\n\n        GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are\n    generated via DNMS.\\n\n        Tripeptide: Consists of 248 random tripeptides. 
Structures are optimized similar to GDB7to9.\\n\n        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n    Structures are optimized similar to GDB7to9.\n\n    Usage:\n    ```python\n    from openqdc.datasets import COMP6\n    dataset = COMP6()\n    ```\n\n    References:\n        https://aip.scitation.org/doi/abs/10.1063/1.5023802\\n\n        https://github.com/isayev/COMP6\\n\n        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\\n\n        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\\n\n        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\\n\n        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h\n    \"\"\"\n\n    __name__ = \"comp6\"\n\n    # watchout that forces are stored as -grad(E)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"  # angstorm\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g*\",\n        PotentialMethod.B3LYP_D3_BJ_DEF2_TZVP,  # \"b3lyp-d3(bj)/def2-tzvp\",\n        PotentialMethod.B3LYP_DEF2_TZVP,  # \"b3lyp/def2-tzvp\",\n        PotentialMethod.HF_DEF2_TZVP,  # \"hf/def2-tzvp\",\n        PotentialMethod.PBE_D3_BJ_DEF2_TZVP,  # \"pbe-d3(bj)/def2-tzvp\",\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n        PotentialMethod.SVWN_DEF2_TZVP,  # \"svwn/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"Energy\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP:def2-tzvp\",\n        \"HF:def2-tzvp\",\n        \"PBE-D3M(BJ):def2-tzvp\",\n        \"PBE:def2-tzvp\",\n        \"SVWN:def2-tzvp\",\n    ]\n    __force_mask__ = [True, False, False, False, False, False, False]\n\n    force_target_names = [\n        \"Gradient\",\n    ]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n      
  \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        samples = []\n        for subset in [\"ani_md\", \"drugbank\", \"gdb7_9\", \"gdb10_13\", \"s66x8\", \"tripeptides\"]:\n            raw_path = p_join(self.root, f\"{subset}.h5.gz\")\n            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/potential/comp6.py</code> <pre><code>def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n</code></pre>"},{"location":"API/datasets/des.html","title":"DES","text":""},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES370K","title":"<code>DES370K</code>","text":"<p>               Bases: <code>BaseInteractionDataset</code>, <code>IDES</code></p> <p>DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions) including water and functional groups found in proteins. 
Dimer geometries are generated using QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.</p> <p>Usage: <pre><code>from openqdc.datasets import DES370K\ndataset = DES370K()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DES370K(BaseInteractionDataset, IDES):\n    \"\"\"\n    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies\n    computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules\n    and ions) including water and functional groups found in proteins. Dimer geometries are generated using\n    QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES370K\n    dataset = DES370K()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des370k_interaction\"\n    __filename__ = \"DES370K.csv\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVDZ,\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_CC_PVDZ,\n        InteractionMethod.CCSD_T_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"cc_MP2_all\",\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"cc_CCSD(T)_all\",\n        \"cbs_CCSD(T)_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES370K.zip\": \"https://zenodo.org/record/5676266/files/DES370K.zip\",\n    }\n\n    @property\n    def csv_path(self):\n        return os.path.join(self.root, self.__filename__)\n\n    def _create_subsets(self, **kwargs):\n        return create_subset(kwargs[\"smiles0\"], kwargs[\"smiles1\"])\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        filepath = self.csv_path\n        logger.info(f\"Reading {self.__name__} interaction data from {filepath}\")\n        df = pd.read_csv(filepath)\n        data = []\n        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):\n            item = parse_des_df(row, self.energy_target_names)\n            item[\"subset\"] = self._create_subsets(row=row, **item)\n            item = convert_to_record(item)\n            data.append(item)\n        return 
data\n</code></pre>"},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES5M","title":"<code>DES5M</code>","text":"<p>               Bases: <code>DES370K</code></p> <p>DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using QM based optimization and MD simulations.</p> <p>Usage: <pre><code>from openqdc.datasets import DES5M\ndataset = DES5M()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DES5M(DES370K):\n    \"\"\"\n    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies\n    computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using\n    QM based optimization and MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES5M\n    dataset = DES5M()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des5m_interaction\"\n    __filename__ = \"DES5M.csv\"\n\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        
InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES5M.zip\": \"https://zenodo.org/records/5706002/files/DESS5M.zip?download=1\",\n    }\n</code></pre>"},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66","title":"<code>DESS66</code>","text":"<p>               Bases: <code>DES370K</code></p> <p>DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. 
The protocol for estimating energies is based on the DES370K paper.</p> <p>Usage: <pre><code>from openqdc.datasets import DESS66\ndataset = DESS66()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> <p>S66: https://pubs.acs.org/doi/10.1021/ct2002946</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DESS66(DES370K):\n    \"\"\"\n    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total.\n    The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66\n    dataset = DESS66()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\\n\n        S66: https://pubs.acs.org/doi/10.1021/ct2002946\n    \"\"\"\n\n    __name__ = \"des_s66\"\n    __filename__ = \"DESS66.csv\"\n    __links__ = {\"DESS66.zip\": \"https://zenodo.org/records/5676284/files/DESS66.zip?download=1\"}\n\n    def _create_subsets(self, **kwargs):\n        return kwargs[\"row\"][\"system_name\"]\n</code></pre>"},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66x8","title":"<code>DESS66x8</code>","text":"<p>               Bases: <code>DESS66</code></p> <p>DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. 
The protocol for estimating energies is based on the DES370K paper.</p> <p>Usage: <pre><code>from openqdc.datasets import DESS66x8\ndataset = DESS66x8()\n</code></pre></p> Reference <p>https://www.nature.com/articles/s41597-021-00833-x</p> Source code in <code>openqdc/datasets/interaction/des.py</code> <pre><code>class DESS66x8(DESS66):\n    \"\"\"\n    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve\n    giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66x8\n    dataset = DESS66x8()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des_s66x8\"\n    __filename__ = \"DESS66x8.csv\"\n    __links__ = {\"DESS66x8.zip\": \"https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1\"}\n</code></pre>"},{"location":"API/datasets/gdml.html","title":"GDML","text":""},{"location":"API/datasets/gdml.html#openqdc.datasets.potential.gdml.GDML","title":"<code>GDML</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of Benzene (627000 conformations), Uracil (133000 conformations), Naphthalene (326000 conformations), Aspirin (211000 conformations), Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for each conformation are computed using the PBE + vdW-TS electronic structure method. 
</p> The dataset consists of the following trajectories <p>Benzene: 627000 samples</p> <p>Uracil: 133000 samples</p> <p>Naphthalene: 326000 samples</p> <p>Aspirin: 211000 samples</p> <p>Salicylic Acid: 320000 samples</p> <p>Malonaldehyde: 993000 samples</p> <p>Ethanol: 555000 samples</p> <p>Toluene: 100000 samples</p> <p>Usage: <pre><code>from openqdc.datasets import GDML\ndataset = GDML()\n</code></pre></p> References <p>https://www.science.org/doi/10.1126/sciadv.1603015 http://www.sgdml.org/#datasets</p> Source code in <code>openqdc/datasets/potential/gdml.py</code> <pre><code>class GDML(BaseDataset):\n    \"\"\"\n    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio\n    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene\n    (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin\n    (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations),\n    Ethanol (555000 conformations) and Toluene (100000 conformations). 
Energy and force labels for\n    each conformation are computed using the PBE + vdW-TS electronic structure method.\n    molecular dynamics (AIMD) trajectories.\n\n    The dataset consists of the following trajectories:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import GDML\n    dataset = GDML()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.1603015\n        http://www.sgdml.org/#datasets\n    \"\"\"\n\n    __name__ = \"gdml\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_CC_PVDZ,  # \"ccsd/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        # TODO: verify if basis set vdw-ts == def2-tzvp and\n        # it is the same in ISO17 and revmd17\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",  # MD17\n    ]\n\n    energy_target_names = [\n        \"CCSD Energy\",\n        \"CCSD(T) Energy\",\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True, True, True]\n\n    force_target_names = [\n        \"CCSD Gradient\",\n        \"CCSD(T) Gradient\",\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __links__ = {\n        \"gdb7_9.hdf5.gz\": \"https://zenodo.org/record/3588361/files/208.hdf5.gz\",\n        \"gdb10_13.hdf5.gz\": \"https://zenodo.org/record/3588364/files/209.hdf5.gz\",\n        \"drugbank.hdf5.gz\": \"https://zenodo.org/record/3588361/files/207.hdf5.gz\",\n        \"tripeptides.hdf5.gz\": \"https://zenodo.org/record/3588368/files/211.hdf5.gz\",\n        \"ani_md.hdf5.gz\": \"https://zenodo.org/record/3588341/files/205.hdf5.gz\",\n        
\"s66x8.hdf5.gz\": \"https://zenodo.org/record/3588367/files/210.hdf5.gz\",\n    }\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"gdml.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"gdml\", self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/geom.html","title":"GEOM","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.</p> <p>Usage: <pre><code>from openqdc.datasets import GEOM\ndataset = GEOM()\n</code></pre></p> References <p>https://www.nature.com/articles/s41597-022-01288-4</p> <p>https://github.com/learningmatter-mit/geom</p> <p>CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d</p> Source code in <code>openqdc/datasets/potential/geom.py</code> <pre><code>class GEOM(BaseDataset):\n    \"\"\"\n    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules\n    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.\n    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and\n    the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the\n    conformational space for each molecule. 
Energies in the dataset are computed using semi-empirical method GFN2-xTB.\n\n    Usage:\n    ```python\n    from openqdc.datasets import GEOM\n    dataset = GEOM()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-022-01288-4\\n\n        https://github.com/learningmatter-mit/geom\\n\n        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d\n    \"\"\"\n\n    __name__ = \"geom\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    energy_target_names = [\"gfn2_xtb.energy\"]\n    force_target_names = []\n\n    partitions = [\"qm9\", \"drugs\"]\n    __links__ = {\"rdkit_folder.tar.gz\": \"https://dataverse.harvard.edu/api/access/datafile/4327252\"}\n\n    def _read_raw_(self, partition):\n        raw_path = p_join(self.root, \"rdkit_folder\")\n\n        mols = load_json(p_join(raw_path, f\"summary_{partition}.json\"))\n        mols = list(mols.items())\n\n        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731\n        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job\n        return samples\n\n    def read_raw_entries(self):\n        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])\n        return samples\n</code></pre>"},{"location":"API/datasets/iso_17.html","title":"ISO_17","text":""},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17","title":"<code>ISO17</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics trajectories. 
The simulations were carried out using density functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.</p> <p>Usage: <pre><code>from openqdc.datasets import ISO17\ndataset = ISO17()\n</code></pre></p> References <p>https://arxiv.org/abs/1706.08566</p> <p>https://arxiv.org/abs/1609.08259</p> <p>https://www.nature.com/articles/sdata201422</p> <p>https://pubmed.ncbi.nlm.nih.gov/10062328/</p> <p>https://pubmed.ncbi.nlm.nih.gov/19257665/</p> Source code in <code>openqdc/datasets/potential/iso_17.py</code> <pre><code>class ISO17(BaseDataset):\n    \"\"\"\n    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of\n    atoms (C7O2H10) arranged in different chemically valid structures. It consist of 129 molecules, each containing\n    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics\n    trajectories. 
The simulations were carried out using density functional theory (DFT) in the generalized gradient\n    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der\n    Waals correction method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ISO17\n    dataset = ISO17()\n    ```\n\n    References:\n        https://arxiv.org/abs/1706.08566\\n\n        https://arxiv.org/abs/1609.08259\\n\n        https://www.nature.com/articles/sdata201422\\n\n        https://pubmed.ncbi.nlm.nih.gov/10062328/\\n\n        https://pubmed.ncbi.nlm.nih.gov/19257665/\n    \"\"\"\n\n    __name__ = \"iso_17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"iso_17.hdf5.gz\": \"https://zenodo.org/record/3585907/files/216.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"iso_17.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"iso_17\", self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/potential/iso_17.py</code> <pre><code>def 
__smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n</code></pre>"},{"location":"API/datasets/l7.html","title":"L7","text":""},{"location":"API/datasets/l7.html#openqdc.datasets.interaction.l7.L7","title":"<code>L7</code>","text":"<p>               Bases: <code>YamlDataset</code></p> <p>The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are taken from crystal X-ray data and optimized with a DFT method specific to the complex.</p> <p>Usage: <pre><code>from openqdc.datasets import L7\ndataset = L7()\n</code></pre></p> Reference <p>https://pubs.acs.org/doi/10.1021/ct400036b</p> Source code in <code>openqdc/datasets/interaction/l7.py</code> <pre><code>class L7(YamlDataset):\n    \"\"\"\n    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with\n    energies labelled using semi-empirical and quantum mechanical methods. 
The intial geometries are\n    taken from crystal X-ray data and optimized with a DFT method specific to the complex.\n\n    Usage:\n    ```python\n    from openqdc.datasets import L7\n    dataset = L7()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct400036b\n    \"\"\"\n\n    __name__ = \"l7\"\n    __energy_methods__ = [\n        InteractionMethod.QCISDT_CBS,  # \"QCISD(T)/CBS\",\n        InteractionMethod.DLPNO_CCSDT,  # \"DLPNO-CCSD(T)\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.MP2C_CBS,  # \"MP2C/CBS\",\n        InteractionMethod.FIXED,  # \"fixed\", TODO: we should remove this level of theory because unless we have a pro\n        InteractionMethod.DLPNO_CCSDT0,  # \"DLPNO-CCSD(T0)\",\n        InteractionMethod.LNO_CCSDT,  # \"LNO-CCSD(T)\",\n        InteractionMethod.FN_DMC,  # \"FN-DMC\",\n    ]\n    __links__ = {\n        \"l7.yaml\": \"http://cuby4.molecular.cz/download_datasets/l7.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/L7.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.geometry.split(\":\")[1]\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        return np.array([int(item.setup[\"molecule_a\"][\"selection\"].split(\"-\")[1])], dtype=np.int32)\n</code></pre>"},{"location":"API/datasets/maceoff.html","title":"MaceOFF","text":""},{"location":"API/datasets/maceoff.html#openqdc.datasets.potential.maceoff.MACEOFF","title":"<code>MACEOFF</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The core of the MACEOFF dataset consists of the SPICE V1 dataset. 95% of the data are used for training and validation under the \"train\" split, and 5% for testing. The dataset uses the SPICE level of theory \u03c9B97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software. 
MACEOFF uses a subset of SPICE that contains the ten chemical elements H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular non-bonded interactions, MACEOFF dataset contains larger 50\u201390 atom molecules randomly selected from the QMugs dataset. MACEOFF contains a number of water clusters carved out of molecular dynamics simulations of liquid water, with sizes of up to 50 water molecules and part of the COMP6 tripeptide geometry dataset.</p> <p>Usage: <pre><code>from openqdc.datasets import MACEOFF\ndataset = MACEOFF()\n</code></pre></p> Species <p>[H, C, N, O, F, P, S, Cl, Br, I]</p> References <p>https://arxiv.org/pdf/2312.15211</p> <p>https://doi.org/10.17863/CAM.107498</p> Source code in <code>openqdc/datasets/potential/maceoff.py</code> <pre><code>class MACEOFF(BaseDataset):\n    \"\"\"\n    MACEOFF dataset core of the dataset consist in the Spice V1 dataset.\n    95% of the data are used for training and validation under the \"train\" split,\n    and 5% for testing. The dataset uses the Spice level of theory\n    \u03c9B97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.\n    MACEOFF uses a subset of SPICE that contains the ten chemical elements\n    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.\n    MACEOFF doesn't contain ion pairs. 
To facilitate the learning of intramolecular\n    non-bonded interactions, MACEOFF dataset contains larger 50\u201390 atom molecules\n    randomly selected from the QMugs dataset.\n    MACEOFF contains a number of water clusters carved out of molecular dynamics simulations\n    of liquid water, with sizes of up to 50 water molecules and part of the\n    COMP6 tripeptide geometry dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MACEOFF\n    dataset = MACEOFF()\n    ```\n\n    Species:\n        [H, C, N, O, F, P, S, Cl, Br, I]\n\n    References:\n        https://arxiv.org/pdf/2312.15211\\n\n        https://doi.org/10.17863/CAM.107498\n    \"\"\"\n\n    __name__ = \"maceoff\"\n\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n\n    energy_target_names = [\"dft_total_energy\"]\n    force_target_names = [\"dft_total_gradient\"]\n\n    __links__ = {\n        \"train_large_neut_no_bad_clean.tar.gz\": \"https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content\",  # noqa: E501\n        \"test_large_neut_all.tar.gz\": \"https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content\",  # noqa: E501\n    }\n\n    def read_raw_entries(self):\n        entries = []\n        for filename in self.__links__:\n            filename = filename.split(\".\")[0]\n            xyzpath = p_join(self.root, f\"{filename}.xyz\")\n            split = filename.split(\"_\")[0]\n            structure_iterator = parse_mace_xyz(xyzpath)\n            func = partial(build_data_object, split=split)\n            entries.extend(dm.utils.parallelized(func, structure_iterator))\n        return entries\n\n    def __getitem__(self, idx):\n        data = super().__getitem__(idx)\n        data.__setattr__(\"split\", 
self._convert_array(self.data[\"split\"][idx]))\n        return data\n</code></pre>"},{"location":"API/datasets/md22.html","title":"MD22","text":""},{"location":"API/datasets/md22.html#openqdc.datasets.potential.md22.MD22","title":"<code>MD22</code>","text":"<p>               Bases: <code>RevMD17</code></p> <p>MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import MD22\ndataset = MD22()\n</code></pre></p> Reference <p>https://arxiv.org/abs/2209.14865</p> Source code in <code>openqdc/datasets/potential/md22.py</code> <pre><code>class MD22(RevMD17):\n    \"\"\"\n    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,\n    ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories\n    are sampled at 400K and 500K with a resolution of 1fs. 
Potential energy and forces are computed using the PBE+MBD\n    level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MD22\n    dataset = MD22()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2209.14865\n    \"\"\"\n\n    __name__ = \"md22\"\n    __links__ = {\n        f\"{x}.npz\": f\"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz\"\n        for x in [\n            \"Ac-Ala3-NHMe\",\n            \"DHA\",\n            \"stachyose\",\n            \"AT-AT\",\n            \"AT-AT-CG-CG\",\n            \"double-walled_nanotube\",\n            \"buckyball-catcher\",\n        ]\n    }\n\n    def read_raw_entries(self):\n        entries_list = []\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n</code></pre>"},{"location":"API/datasets/metcalf.html","title":"Metcalf","text":""},{"location":"API/datasets/metcalf.html#openqdc.datasets.interaction.metcalf.Metcalf","title":"<code>Metcalf</code>","text":"<p>               Bases: <code>BaseInteractionDataset</code></p> <p>Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various components.</p> <p>Usage: <pre><code>from openqdc.datasets import Metcalf\ndataset = Metcalf()\n</code></pre></p> Reference <p>https://doi.org/10.1063/1.5142636</p> Source code in <code>openqdc/datasets/interaction/metcalf.py</code> <pre><code>class Metcalf(BaseInteractionDataset):\n    \"\"\"\n    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to\n    156,704 geometries/configurations for each complex. 
The geometries are optimized using the RI-MP2 method and\n    the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various\n    components.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Metcalf\n    dataset = Metcalf()\n    ```\n\n    Reference:\n        https://doi.org/10.1063/1.5142636\n    \"\"\"\n\n    __name__ = \"metcalf\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n    ]\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = [\n        \"total energy\",\n        \"electrostatic energy\",\n        \"exchange energy\",\n        \"induction energy\",\n        \"dispersion energy\",\n    ]\n    __links__ = {\"model-data.tar.gz\": \"https://zenodo.org/records/10934211/files/model-data.tar?download=1\"}\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        # extract in folders\n        extract_raw_tar_gz(self.root)\n        data = []\n        for filename in glob(self.root + f\"{os.sep}*.xyz\"):\n            data.extend(read_xyz(filename, self.__name__))\n        return data\n</code></pre>"},{"location":"API/datasets/molecule3d.html","title":"Molecule3D","text":""},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.Molecule3D","title":"<code>Molecule3D</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. 
The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, or with damaged log files.</p> <p>Usage: <pre><code>from openqdc.datasets import Molecule3D\ndataset = Molecule3D()\n</code></pre></p> References <p>https://arxiv.org/abs/2110.01717</p> <p>https://github.com/divelab/MoleculeX</p> Source code in <code>openqdc/datasets/potential/molecule3d.py</code> <pre><code>class Molecule3D(BaseDataset):\n    \"\"\"\n    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the\n    B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing\n    molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems,\n    or with damaged log files.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Molecule3D\n    dataset = Molecule3D()\n    ```\n\n    References:\n        https://arxiv.org/abs/2110.01717\\n\n        https://github.com/divelab/MoleculeX\n    \"\"\"\n\n    __name__ = \"molecule3d\"\n    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # \"b3lyp/6-31g*\",\n    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY\n    __energy_unit__ = \"ev\"  # CALCULATED\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"molecule3d.zip\": \"https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy\"}\n\n    energy_target_names = [\"b3lyp/6-31g*.energy\"]\n\n    def read_raw_entries(self):\n        raw = p_join(self.root, \"data\", \"raw\")\n        sdf_paths = glob(p_join(raw, \"*.sdf\"))\n        properties_path = p_join(raw, \"properties.csv\")\n\n        fn = lambda x: _read_sdf(x, properties_path)\n        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job\n        samples = sum(res, [])\n        return 
samples\n</code></pre>"},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol","title":"<code>read_mol(mol, energy)</code>","text":"<p>Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies</p>"},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--parameters","title":"Parameters","text":"<p>mol: Chem.rdchem.Mol     RDKit molecule energy: float     Energy of the molecule</p>"},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--returns","title":"Returns","text":"<p>res: dict     Dictionary containing the following keys:     - name: np.ndarray of shape (N,) containing the smiles of the molecule     - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions     - energies: np.ndarray of shape (1,) containing the energy of the conformer     - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer     - subset: np.ndarray of shape (1) containing \"molecule3d\"</p> Source code in <code>openqdc/datasets/potential/molecule3d.py</code> <pre><code>def read_mol(mol: Chem.rdchem.Mol, energy: float) -&gt; Dict[str, np.ndarray]:\n    \"\"\"Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies\n\n    Parameters\n    ----------\n    mol: Chem.rdchem.Mol\n        RDKit molecule\n    energy: float\n        Energy of the molecule\n\n    Returns\n    -------\n    res: dict\n        Dictionary containing the following keys:\n        - name: np.ndarray of shape (N,) containing the smiles of the molecule\n        - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions\n        - energies: np.ndarray of shape (1,) containing the energy of the conformer\n        - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer\n        - subset: np.ndarray of 
shape (1) containing \"molecule3d\"\n    \"\"\"\n    smiles = dm.to_smiles(mol, explicit_hs=False)\n    # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)\n    x = get_atomic_number_and_charge(mol)\n    positions = mol.GetConformer().GetPositions()\n\n    res = dict(\n        name=np.array([smiles]),\n        subset=np.array([\"molecule3d\"]),\n        energies=np.array([energy]).astype(np.float64)[:, None],\n        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),\n        n_atoms=np.array([x.shape[0]], dtype=np.int32),\n    )\n\n    return res\n</code></pre>"},{"location":"API/datasets/multixcqm9.html","title":"MultixcQM9","text":""},{"location":"API/datasets/multixcqm9.html#openqdc.datasets.potential.multixcqm9.MultixcQM9","title":"<code>MultixcQM9</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the molecules are used directly from Kim et al. 
which uses G4MP2 method.</p> <p>Usage: <pre><code>from openqdc.datasets import MultixcQM9\ndataset = MultixcQM9()\n</code></pre></p> References <p>https://www.nature.com/articles/s41597-023-02690-2</p> <p>https://github.com/chemsurajit/largeDFTdata</p> <p>https://www.nature.com/articles/s41597-019-0121-7</p> Source code in <code>openqdc/datasets/potential/multixcqm9.py</code> <pre><code>class MultixcQM9(BaseDataset):\n    \"\"\"\n    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting\n    of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets\n    resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the\n    molecules are used directly from Kim et al. which uses G4MP2 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MultixcQM9\n    dataset = MultixcQM9()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-023-02690-2\\n\n        https://github.com/chemsurajit/largeDFTdata\\n\n        https://www.nature.com/articles/s41597-019-0121-7\\n\n    \"\"\"\n\n    __name__ = \"multixcqm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.KCIS_MODIFIED_DZP,\n        PotentialMethod.KCIS_ORIGINAL_DZP,\n        PotentialMethod.PKZB_DZP,\n        PotentialMethod.VS98_DZP,\n        PotentialMethod.LDA_VWN_DZP,\n        PotentialMethod.PW91_DZP,\n        PotentialMethod.BLYP_DZP,\n        PotentialMethod.BP_DZP,\n        PotentialMethod.PBE_DZP,\n        PotentialMethod.RPBE_DZP,\n        PotentialMethod.REVPBE_DZP,\n        PotentialMethod.OLYP_DZP,\n        PotentialMethod.FT97_DZP,\n        PotentialMethod.BLAP3_DZP,\n        PotentialMethod.HCTH_93_DZP,\n        PotentialMethod.HCTH_120_DZP,\n        PotentialMethod.HCTH_147_DZP,\n        PotentialMethod.HCTH_407_DZP,\n        PotentialMethod.BMTAU1_DZP,\n        PotentialMethod.BOP_DZP,\n        
PotentialMethod.PKZBX_KCISCOR_DZP,\n        PotentialMethod.VS98_X_XC_DZP,\n        PotentialMethod.VS98_X_ONLY_DZP,\n        PotentialMethod.BECKE00_DZP,\n        PotentialMethod.BECKE00X_XC_DZP,\n        PotentialMethod.BECKE00_X_ONLY_DZP,\n        PotentialMethod.BECKE88X_BR89C_DZP,\n        PotentialMethod.OLAP3_DZP,\n        PotentialMethod.TPSS_DZP,\n        PotentialMethod.MPBE_DZP,\n        PotentialMethod.OPBE_DZP,\n        PotentialMethod.OPERDEW_DZP,\n        PotentialMethod.MPBEKCIS_DZP,\n        PotentialMethod.MPW_DZP,\n        PotentialMethod.TAU_HCTH_DZP,\n        PotentialMethod.XLYP_DZP,\n        PotentialMethod.KT1_DZP,\n        PotentialMethod.KT2_DZP,\n        PotentialMethod.M06_L_DZP,\n        PotentialMethod.BLYP_D_DZP,\n        PotentialMethod.BP86_D_DZP,\n        PotentialMethod.PBE_D_DZP,\n        PotentialMethod.TPSSD_DZP,\n        PotentialMethod.B97_D_DZP,\n        PotentialMethod.REVTPSS_DZP,\n        PotentialMethod.PBESOL_DZP,\n        PotentialMethod.RGE2_DZP,\n        PotentialMethod.SSB_D_DZP,\n        PotentialMethod.MVS_DZP,\n        PotentialMethod.MVSX_DZP,\n        PotentialMethod.TMGGA_DZP,\n        PotentialMethod.TPSSH_DZP,\n        PotentialMethod.B3LYP_VWN5_DZP,\n        PotentialMethod.O3LYP_VWN5_DZP,\n        PotentialMethod.KMLYP_VWN5_DZP,\n        PotentialMethod.PBE0_DZP,\n        PotentialMethod.B3LYP_S_VWN5_DZP,\n        PotentialMethod.BHANDH_DZP,\n        PotentialMethod.BHANDHLYP_DZP,\n        PotentialMethod.B97_DZP,\n        PotentialMethod.B97_1_DZP,\n        PotentialMethod.B97_2_DZP,\n        PotentialMethod.MPBE0KCIS_DZP,\n        PotentialMethod.MPBE1KCIS_DZP,\n        PotentialMethod.B1LYP_VWN5_DZP,\n        PotentialMethod.B1PW91_VWN5_DZP,\n        PotentialMethod.MPW1PW_DZP,\n        PotentialMethod.MPW1K_DZP,\n        PotentialMethod.TAU_HCTH_HYBRID_DZP,\n        PotentialMethod.X3LYP_VWN5_DZP,\n        PotentialMethod.OPBE0_DZP,\n        PotentialMethod.M05_DZP,\n        
PotentialMethod.M05_2X_DZP,\n        PotentialMethod.M06_DZP,\n        PotentialMethod.M06_2X_DZP,\n        PotentialMethod.B3LYP_D_DZP,\n        PotentialMethod.KCIS_MODIFIED_TZP,\n        PotentialMethod.KCIS_ORIGINAL_TZP,\n        PotentialMethod.PKZB_TZP,\n        PotentialMethod.VS98_TZP,\n        PotentialMethod.LDA_VWN_TZP,\n        PotentialMethod.PW91_TZP,\n        PotentialMethod.BLYP_TZP,\n        PotentialMethod.BP_TZP,\n        PotentialMethod.PBE_TZP,\n        PotentialMethod.RPBE_TZP,\n        PotentialMethod.REVPBE_TZP,\n        PotentialMethod.OLYP_TZP,\n        PotentialMethod.FT97_TZP,\n        PotentialMethod.BLAP3_TZP,\n        PotentialMethod.HCTH_93_TZP,\n        PotentialMethod.HCTH_120_TZP,\n        PotentialMethod.HCTH_147_TZP,\n        PotentialMethod.HCTH_407_TZP,\n        PotentialMethod.BMTAU1_TZP,\n        PotentialMethod.BOP_TZP,\n        PotentialMethod.PKZBX_KCISCOR_TZP,\n        PotentialMethod.VS98_X_XC_TZP,\n        PotentialMethod.VS98_X_ONLY_TZP,\n        PotentialMethod.BECKE00_TZP,\n        PotentialMethod.BECKE00X_XC_TZP,\n        PotentialMethod.BECKE00_X_ONLY_TZP,\n        PotentialMethod.BECKE88X_BR89C_TZP,\n        PotentialMethod.OLAP3_TZP,\n        PotentialMethod.TPSS_TZP,\n        PotentialMethod.MPBE_TZP,\n        PotentialMethod.OPBE_TZP,\n        PotentialMethod.OPERDEW_TZP,\n        PotentialMethod.MPBEKCIS_TZP,\n        PotentialMethod.MPW_TZP,\n        PotentialMethod.TAU_HCTH_TZP,\n        PotentialMethod.XLYP_TZP,\n        PotentialMethod.KT1_TZP,\n        PotentialMethod.KT2_TZP,\n        PotentialMethod.M06_L_TZP,\n        PotentialMethod.BLYP_D_TZP,\n        PotentialMethod.BP86_D_TZP,\n        PotentialMethod.PBE_D_TZP,\n        PotentialMethod.TPSSD_TZP,\n        PotentialMethod.B97_D_TZP,\n        PotentialMethod.REVTPSS_TZP,\n        PotentialMethod.PBESOL_TZP,\n        PotentialMethod.RGE2_TZP,\n        PotentialMethod.SSB_D_TZP,\n        PotentialMethod.MVS_TZP,\n        PotentialMethod.MVSX_TZP,\n  
      PotentialMethod.TMGGA_TZP,\n        PotentialMethod.TPSSH_TZP,\n        PotentialMethod.B3LYP_VWN5_TZP,\n        PotentialMethod.O3LYP_VWN5_TZP,\n        PotentialMethod.KMLYP_VWN5_TZP,\n        PotentialMethod.PBE0_TZP,\n        PotentialMethod.B3LYP_S_VWN5_TZP,\n        PotentialMethod.BHANDH_TZP,\n        PotentialMethod.BHANDHLYP_TZP,\n        PotentialMethod.B97_TZP,\n        PotentialMethod.B97_1_TZP,\n        PotentialMethod.B97_2_TZP,\n        PotentialMethod.MPBE0KCIS_TZP,\n        PotentialMethod.MPBE1KCIS_TZP,\n        PotentialMethod.B1LYP_VWN5_TZP,\n        PotentialMethod.B1PW91_VWN5_TZP,\n        PotentialMethod.MPW1PW_TZP,\n        PotentialMethod.MPW1K_TZP,\n        PotentialMethod.TAU_HCTH_HYBRID_TZP,\n        PotentialMethod.X3LYP_VWN5_TZP,\n        PotentialMethod.OPBE0_TZP,\n        PotentialMethod.M05_TZP,\n        PotentialMethod.M05_2X_TZP,\n        PotentialMethod.M06_TZP,\n        PotentialMethod.M06_2X_TZP,\n        PotentialMethod.B3LYP_D_TZP,\n        PotentialMethod.KCIS_MODIFIED_SZ,\n        PotentialMethod.KCIS_ORIGINAL_SZ,\n        PotentialMethod.PKZB_SZ,\n        PotentialMethod.VS98_SZ,\n        PotentialMethod.LDA_VWN_SZ,\n        PotentialMethod.PW91_SZ,\n        PotentialMethod.BLYP_SZ,\n        PotentialMethod.BP_SZ,\n        PotentialMethod.PBE_SZ,\n        PotentialMethod.RPBE_SZ,\n        PotentialMethod.REVPBE_SZ,\n        PotentialMethod.OLYP_SZ,\n        PotentialMethod.FT97_SZ,\n        PotentialMethod.BLAP3_SZ,\n        PotentialMethod.HCTH_93_SZ,\n        PotentialMethod.HCTH_120_SZ,\n        PotentialMethod.HCTH_147_SZ,\n        PotentialMethod.HCTH_407_SZ,\n        PotentialMethod.BMTAU1_SZ,\n        PotentialMethod.BOP_SZ,\n        PotentialMethod.PKZBX_KCISCOR_SZ,\n        PotentialMethod.VS98_X_XC_SZ,\n        PotentialMethod.VS98_X_ONLY_SZ,\n        PotentialMethod.BECKE00_SZ,\n        PotentialMethod.BECKE00X_XC_SZ,\n        PotentialMethod.BECKE00_X_ONLY_SZ,\n        PotentialMethod.BECKE88X_BR89C_SZ,\n 
       PotentialMethod.OLAP3_SZ,\n        PotentialMethod.TPSS_SZ,\n        PotentialMethod.MPBE_SZ,\n        PotentialMethod.OPBE_SZ,\n        PotentialMethod.OPERDEW_SZ,\n        PotentialMethod.MPBEKCIS_SZ,\n        PotentialMethod.MPW_SZ,\n        PotentialMethod.TAU_HCTH_SZ,\n        PotentialMethod.XLYP_SZ,\n        PotentialMethod.KT1_SZ,\n        PotentialMethod.KT2_SZ,\n        PotentialMethod.M06_L_SZ,\n        PotentialMethod.BLYP_D_SZ,\n        PotentialMethod.BP86_D_SZ,\n        PotentialMethod.PBE_D_SZ,\n        PotentialMethod.TPSSD_SZ,\n        PotentialMethod.B97_D_SZ,\n        PotentialMethod.REVTPSS_SZ,\n        PotentialMethod.PBESOL_SZ,\n        PotentialMethod.RGE2_SZ,\n        PotentialMethod.SSB_D_SZ,\n        PotentialMethod.MVS_SZ,\n        PotentialMethod.MVSX_SZ,\n        PotentialMethod.TMGGA_SZ,\n        PotentialMethod.TPSSH_SZ,\n        PotentialMethod.B3LYP_VWN5_SZ,\n        PotentialMethod.O3LYP_VWN5_SZ,\n        PotentialMethod.KMLYP_VWN5_SZ,\n        PotentialMethod.PBE0_SZ,\n        PotentialMethod.B3LYP_S_VWN5_SZ,\n        PotentialMethod.BHANDH_SZ,\n        PotentialMethod.BHANDHLYP_SZ,\n        PotentialMethod.B97_SZ,\n        PotentialMethod.B97_1_SZ,\n        PotentialMethod.B97_2_SZ,\n        PotentialMethod.MPBE0KCIS_SZ,\n        PotentialMethod.MPBE1KCIS_SZ,\n        PotentialMethod.B1LYP_VWN5_SZ,\n        PotentialMethod.B1PW91_VWN5_SZ,\n        PotentialMethod.MPW1PW_SZ,\n        PotentialMethod.MPW1K_SZ,\n        PotentialMethod.TAU_HCTH_HYBRID_SZ,\n        PotentialMethod.X3LYP_VWN5_SZ,\n        PotentialMethod.OPBE0_SZ,\n        PotentialMethod.M05_SZ,\n        PotentialMethod.M05_2X_SZ,\n        PotentialMethod.M06_SZ,\n        PotentialMethod.M06_2X_SZ,\n        PotentialMethod.B3LYP_D_SZ,\n        PotentialMethod.GFN2_XTB,\n    ]\n\n    energy_target_names = [\n        \"KCIS-MODIFIED/DZP\",\n        \"KCIS-ORIGINAL/DZP\",\n        \"PKZB/DZP\",\n        \"VS98/DZP\",\n        \"LDA(VWN)/DZP\",\n        
\"PW91/DZP\",\n        \"BLYP/DZP\",\n        \"BP/DZP\",\n        \"PBE/DZP\",\n        \"RPBE/DZP\",\n        \"REVPBE/DZP\",\n        \"OLYP/DZP\",\n        \"FT97/DZP\",\n        \"BLAP3/DZP\",\n        \"HCTH/93/DZP\",\n        \"HCTH/120/DZP\",\n        \"HCTH/147/DZP\",\n        \"HCTH/407/DZP\",\n        \"BMTAU1/DZP\",\n        \"BOP/DZP\",\n        \"PKZBX-KCISCOR/DZP\",\n        \"VS98-X(XC)/DZP\",\n        \"VS98-X-ONLY/DZP\",\n        \"BECKE00/DZP\",\n        \"BECKE00X(XC)/DZP\",\n        \"BECKE00-X-ONLY/DZP\",\n        \"BECKE88X+BR89C/DZP\",\n        \"OLAP3/DZP\",\n        \"TPSS/DZP\",\n        \"MPBE/DZP\",\n        \"OPBE/DZP\",\n        \"OPERDEW/DZP\",\n        \"MPBEKCIS/DZP\",\n        \"MPW/DZP\",\n        \"TAU-HCTH/DZP\",\n        \"XLYP/DZP\",\n        \"KT1/DZP\",\n        \"KT2/DZP\",\n        \"M06-L/DZP\",\n        \"BLYP-D/DZP\",\n        \"BP86-D/DZP\",\n        \"PBE-D/DZP\",\n        \"TPSS-D/DZP\",\n        \"B97-D/DZP\",\n        \"REVTPSS/DZP\",\n        \"PBESOL/DZP\",\n        \"RGE2/DZP\",\n        \"SSB-D/DZP\",\n        \"MVS/DZP\",\n        \"MVSX/DZP\",\n        \"T-MGGA/DZP\",\n        \"TPSSH/DZP\",\n        \"B3LYP(VWN5)/DZP\",\n        \"O3LYP(VWN5)/DZP\",\n        \"KMLYP(VWN5)/DZP\",\n        \"PBE0/DZP\",\n        \"B3LYP*(VWN5)/DZP\",\n        \"BHANDH/DZP\",\n        \"BHANDHLYP/DZP\",\n        \"B97/DZP\",\n        \"B97-1/DZP\",\n        \"B97-2/DZP\",\n        \"MPBE0KCIS/DZP\",\n        \"MPBE1KCIS/DZP\",\n        \"B1LYP(VWN5)/DZP\",\n        \"B1PW91(VWN5)/DZP\",\n        \"MPW1PW/DZP\",\n        \"MPW1K/DZP\",\n        \"TAU-HCTH-HYBRID/DZP\",\n        \"X3LYP(VWN5)/DZP\",\n        \"OPBE0/DZP\",\n        \"M05/DZP\",\n        \"M05-2X/DZP\",\n        \"M06/DZP\",\n        \"M06-2X/DZP\",\n        \"B3LYP-D/DZP\",\n        \"KCIS-MODIFIED/TZP\",\n        \"KCIS-ORIGINAL/TZP\",\n        \"PKZB/TZP\",\n        \"VS98/TZP\",\n        \"LDA(VWN)/TZP\",\n        \"PW91/TZP\",\n        \"BLYP/TZP\",\n        
\"BP/TZP\",\n        \"PBE/TZP\",\n        \"RPBE/TZP\",\n        \"REVPBE/TZP\",\n        \"OLYP/TZP\",\n        \"FT97/TZP\",\n        \"BLAP3/TZP\",\n        \"HCTH/93/TZP\",\n        \"HCTH/120/TZP\",\n        \"HCTH/147/TZP\",\n        \"HCTH/407/TZP\",\n        \"BMTAU1/TZP\",\n        \"BOP/TZP\",\n        \"PKZBX-KCISCOR/TZP\",\n        \"VS98-X(XC)/TZP\",\n        \"VS98-X-ONLY/TZP\",\n        \"BECKE00/TZP\",\n        \"BECKE00X(XC)/TZP\",\n        \"BECKE00-X-ONLY/TZP\",\n        \"BECKE88X+BR89C/TZP\",\n        \"OLAP3/TZP\",\n        \"TPSS/TZP\",\n        \"MPBE/TZP\",\n        \"OPBE/TZP\",\n        \"OPERDEW/TZP\",\n        \"MPBEKCIS/TZP\",\n        \"MPW/TZP\",\n        \"TAU-HCTH/TZP\",\n        \"XLYP/TZP\",\n        \"KT1/TZP\",\n        \"KT2/TZP\",\n        \"M06-L/TZP\",\n        \"BLYP-D/TZP\",\n        \"BP86-D/TZP\",\n        \"PBE-D/TZP\",\n        \"TPSS-D/TZP\",\n        \"B97-D/TZP\",\n        \"REVTPSS/TZP\",\n        \"PBESOL/TZP\",\n        \"RGE2/TZP\",\n        \"SSB-D/TZP\",\n        \"MVS/TZP\",\n        \"MVSX/TZP\",\n        \"T-MGGA/TZP\",\n        \"TPSSH/TZP\",\n        \"B3LYP(VWN5)/TZP\",\n        \"O3LYP(VWN5)/TZP\",\n        \"KMLYP(VWN5)/TZP\",\n        \"PBE0/TZP\",\n        \"B3LYP*(VWN5)/TZP\",\n        \"BHANDH/TZP\",\n        \"BHANDHLYP/TZP\",\n        \"B97/TZP\",\n        \"B97-1/TZP\",\n        \"B97-2/TZP\",\n        \"MPBE0KCIS/TZP\",\n        \"MPBE1KCIS/TZP\",\n        \"B1LYP(VWN5)/TZP\",\n        \"B1PW91(VWN5)/TZP\",\n        \"MPW1PW/TZP\",\n        \"MPW1K/TZP\",\n        \"TAU-HCTH-HYBRID/TZP\",\n        \"X3LYP(VWN5)/TZP\",\n        \"OPBE0/TZP\",\n        \"M05/TZP\",\n        \"M05-2X/TZP\",\n        \"M06/TZP\",\n        \"M06-2X/TZP\",\n        \"B3LYP-D/TZP\",\n        \"KCIS-MODIFIED/SZ\",\n        \"KCIS-ORIGINAL/SZ\",\n        \"PKZB/SZ\",\n        \"VS98/SZ\",\n        \"LDA(VWN)/SZ\",\n        \"PW91/SZ\",\n        \"BLYP/SZ\",\n        \"BP/SZ\",\n        \"PBE/SZ\",\n        
\"RPBE/SZ\",\n        \"REVPBE/SZ\",\n        \"OLYP/SZ\",\n        \"FT97/SZ\",\n        \"BLAP3/SZ\",\n        \"HCTH/93/SZ\",\n        \"HCTH/120/SZ\",\n        \"HCTH/147/SZ\",\n        \"HCTH/407/SZ\",\n        \"BMTAU1/SZ\",\n        \"BOP/SZ\",\n        \"PKZBX-KCISCOR/SZ\",\n        \"VS98-X(XC)/SZ\",\n        \"VS98-X-ONLY/SZ\",\n        \"BECKE00/SZ\",\n        \"BECKE00X(XC)/SZ\",\n        \"BECKE00-X-ONLY/SZ\",\n        \"BECKE88X+BR89C/SZ\",\n        \"OLAP3/SZ\",\n        \"TPSS/SZ\",\n        \"MPBE/SZ\",\n        \"OPBE/SZ\",\n        \"OPERDEW/SZ\",\n        \"MPBEKCIS/SZ\",\n        \"MPW/SZ\",\n        \"TAU-HCTH/SZ\",\n        \"XLYP/SZ\",\n        \"KT1/SZ\",\n        \"KT2/SZ\",\n        \"M06-L/SZ\",\n        \"BLYP-D/SZ\",\n        \"BP86-D/SZ\",\n        \"PBE-D/SZ\",\n        \"TPSS-D/SZ\",\n        \"B97-D/SZ\",\n        \"REVTPSS/SZ\",\n        \"PBESOL/SZ\",\n        \"RGE2/SZ\",\n        \"SSB-D/SZ\",\n        \"MVS/SZ\",\n        \"MVSX/SZ\",\n        \"T-MGGA/SZ\",\n        \"TPSSH/SZ\",\n        \"B3LYP(VWN5)/SZ\",\n        \"O3LYP(VWN5)/SZ\",\n        \"KMLYP(VWN5)/SZ\",\n        \"PBE0/SZ\",\n        \"B3LYP*(VWN5)/SZ\",\n        \"BHANDH/SZ\",\n        \"BHANDHLYP/SZ\",\n        \"B97/SZ\",\n        \"B97-1/SZ\",\n        \"B97-2/SZ\",\n        \"MPBE0KCIS/SZ\",\n        \"MPBE1KCIS/SZ\",\n        \"B1LYP(VWN5)/SZ\",\n        \"B1PW91(VWN5)/SZ\",\n        \"MPW1PW/SZ\",\n        \"MPW1K/SZ\",\n        \"TAU-HCTH-HYBRID/SZ\",\n        \"X3LYP(VWN5)/SZ\",\n        \"OPBE0/SZ\",\n        \"M05/SZ\",\n        \"M05-2X/SZ\",\n        \"M06/SZ\",\n        \"M06-2X/SZ\",\n        \"B3LYP-D/SZ\",\n        \"GFNXTB\",\n    ]\n\n    __energy_unit__ = \"ev\"  # to fix\n    __distance_unit__ = \"ang\"  # to fix\n    __forces_unit__ = \"ev/ang\"  # to fix\n    __links__ = {\n        \"xyz.zip\": \"https://data.dtu.dk/ndownloader/files/35143624\",\n        \"xtb.zip\": \"https://data.dtu.dk/ndownloader/files/42444300\",\n        \"dzp.zip\": 
\"https://data.dtu.dk/ndownloader/files/42443925\",\n        \"tzp.zip\": \"https://data.dtu.dk/ndownloader/files/42444129\",\n        \"sz.zip\": \"https://data.dtu.dk/ndownloader/files/42441345\",\n        \"failed_indices.dat\": \"https://data.dtu.dk/ndownloader/files/37337677\",\n    }\n\n    def _read_molecules_energies(self):\n        d = {\"DZP\": None, \"TZP\": None, \"SZ\": None, \"XTB\": None}\n        for basis in d.keys():\n            d[basis] = pd.read_csv(p_join(self.root, basis, \"molecules/molecules.csv\"), index_col=False).drop(\n                columns=[\"index\"]\n            )\n        return pd.concat([d[\"DZP\"], d[\"TZP\"], d[\"SZ\"], d[\"XTB\"]], axis=1, ignore_index=False)\n\n    def _read_all_xyzs(self):\n        xyz_list = read_xyz_files(self.root)\n        return pd.DataFrame(xyz_list)\n\n    def read_raw_entries(self):\n        df_energies = self._read_molecules_energies()\n        df_xyz = self._read_all_xyzs()\n        return [\n            {\"energies\": np.atleast_2d(en), **xyz_dict}\n            for xyz_dict, en in zip(df_xyz.to_dict(\"records\"), df_energies.values.astype(np.float64))\n        ]\n</code></pre>"},{"location":"API/datasets/nabladft.html","title":"NablaDFT","text":""},{"location":"API/datasets/nabladft.html#openqdc.datasets.potential.nabladft.NablaDFT","title":"<code>NablaDFT</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>NablaDFT is a dataset constructed from a subset of the Molecular Sets (MOSES) dataset consisting of 1 million molecules with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set. This results in 1-62 conformations per molecule. 
For generating quantum properties, Kohn-Sham method at wB97X-D/def2-XVP levels are used to generate the energy.</p> <p>Usage: <pre><code>from openqdc.datasets import NablaDFT\ndataset = NablaDFT()\n</code></pre></p> References <p>https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D</p> <p>https://github.com/AIRI-Institute/nablaDFT</p> Source code in <code>openqdc/datasets/potential/nabladft.py</code> <pre><code>class NablaDFT(BaseDataset):\n    \"\"\"\n    NablaDFT is a dataset constructed from a subset of the\n    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules\n    with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of\n    conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that\n    cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set.\n    This results in 1-62 conformations per molecule. 
For generating quantum properties, Kohn-Sham method at\n    wB97X-D/def2-XVP levels are used to generate the energy.\n\n    Usage:\n    ```python\n    from openqdc.datasets import NablaDFT\n    dataset = NablaDFT()\n    ```\n\n    References:\n        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\\n\n        https://github.com/AIRI-Institute/nablaDFT\n    \"\"\"\n\n    __name__ = \"nabladft\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D_DEF2_SVP,\n    ]  # \"wb97x-d/def2-svp\"\n\n    energy_target_names = [\"wb97x-d/def2-svp\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"nabladft.db\": \"https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db\"}\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    @requires_package(\"nablaDFT\")\n    def read_raw_entries(self):\n        from nablaDFT.dataset import HamiltonianDatabase\n\n        label_path = p_join(self.root, \"summary.csv\")\n        df = pd.read_csv(label_path, usecols=[\"MOSES id\", \"CONFORMER id\", \"SMILES\", \"DFT TOTAL ENERGY\"])\n        labels = df.set_index(keys=[\"MOSES id\", \"CONFORMER id\"]).to_dict(\"index\")\n\n        raw_path = p_join(self.root, \"dataset_full.db\")\n        train = HamiltonianDatabase(raw_path)\n        n, c = len(train), 20\n        step_size = int(np.ceil(n / os.cpu_count()))\n\n        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)\n        samples = dm.parallelized(\n            fn, list(range(c)), n_jobs=c, progress=False, scheduler=\"threads\"\n        )  # don't use more than 1 job\n\n        return sum(samples, 
[])\n</code></pre>"},{"location":"API/datasets/orbnet_denali.html","title":"Orbnet Denali","text":""},{"location":"API/datasets/orbnet_denali.html#openqdc.datasets.potential.orbnet_denali.OrbnetDenali","title":"<code>OrbnetDenali</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import OrbnetDenali\ndataset = OrbnetDenali()\n</code></pre></p> References <p>https://arxiv.org/abs/2107.00299</p> <p>https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867</p> Source code in <code>openqdc/datasets/potential/orbnet_denali.py</code> <pre><code>class OrbnetDenali(BaseDataset):\n    \"\"\"\n    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range\n    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and\n    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps.\n    First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer\n    generator. 
Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using\n    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of\n    theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of\n    theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import OrbnetDenali\n    dataset = OrbnetDenali()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00299\\n\n        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867\n    \"\"\"\n\n    __name__ = \"orbnet_denali\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_DEF2_TZVP,\n        PotentialMethod.GFN1_XTB,\n    ]  # [\"wb97x-d3/def2-tzvp\", \"gfn1_xtb\"]\n    energy_target_names = [\"dft_energy\", \"xtb1_energy\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"orbnet_denali.tar.gz\": \"https://figshare.com/ndownloader/files/28672287\",\n        \"orbnet_denali_targets.tar.gz\": \"https://figshare.com/ndownloader/files/28672248\",\n    }\n\n    def read_raw_entries(self):\n        label_path = p_join(self.root, \"denali_labels.csv\")\n        df = pd.read_csv(label_path, usecols=[\"sample_id\", \"mol_id\", \"subset\", \"dft_energy\", \"xtb1_energy\"])\n        labels = {\n            mol_id: group.drop([\"mol_id\"], axis=1).drop_duplicates(\"sample_id\").set_index(\"sample_id\").to_dict(\"index\")\n            for mol_id, group in df.groupby(\"mol_id\")\n        }\n\n        fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)\n        res = dm.parallelized(fn, list(labels.items()), scheduler=\"threads\", n_jobs=-1, progress=True)\n        samples = sum(res, [])\n        return 
samples\n</code></pre>"},{"location":"API/datasets/pcqm.html","title":"PCQM","text":""},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_B3LYP","title":"<code>PCQM_B3LYP</code>","text":"<p>               Bases: <code>PCQM_PM6</code></p> <p>PubChemQC B3LYP/6-31G (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry, the electronic structure and properties are calculated using B3LIP/6-31G method.</p> <p>Usage: <pre><code>from openqdc.datasets import PCQM_B3LYP\ndataset = PCQM_B3LYP()\n</code></pre></p> References <p>https://arxiv.org/abs/2305.18454</p> Source code in <code>openqdc/datasets/potential/pcqm.py</code> <pre><code>class PCQM_B3LYP(PCQM_PM6):\n    \"\"\"\n    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to\n    biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry,\n    the electronic structure and properties are calculated using B3LIP/6-31G* method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_B3LYP\n    dataset = PCQM_B3LYP()\n    ```\n\n    References:\n        https://arxiv.org/abs/2305.18454\n    \"\"\"\n\n    __name__ = \"pubchemqc_b3lyp\"\n    __energy_methods__ = [\"b3lyp/6-31g*\"]\n    energy_target_names = [\"b3lyp\"]\n</code></pre>"},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_PM6","title":"<code>PCQM_PM6</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with weights less than 1000g/mol are considered from the PubChem ftp site. 
The initial structure is generated using OpenBabel and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also computed using the PM6 method.</p> <p>Usage: <pre><code>from openqdc.datasets import PCQM_PM6\ndataset = PCQM_PM6()\n</code></pre></p> References <p>https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740</p> Source code in <code>openqdc/datasets/potential/pcqm.py</code> <pre><code>class PCQM_PM6(BaseDataset):\n    \"\"\"\n    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized\n    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less\n    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel\n    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also\n    computed using the PM6 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_PM6\n    dataset = PCQM_PM6()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740\n    \"\"\"\n\n    __name__ = \"pubchemqc_pm6\"\n    __energy_methods__ = [PotentialMethod.PM6]\n\n    energy_target_names = [\"pm6\"]\n\n    __force_methods__ = []\n    force_target_names = []\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"pubchemqc\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def collate_list(self, list_entries):\n        predicat = list_entries is not None and len(list_entries) &gt; 0\n        list_entries = [x for x in list_entries if x is not None]\n        if predicat:\n            res = super().collate_list(list_entries)\n        else:\n            res = None\n        return res\n\n    @property\n    def data_types(self):\n        
return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    def read_raw_entries(self):\n        arxiv_paths = glob(p_join(self.root, f\"{self.__energy_methods__[0]}\", \"*.pkl\"))\n        f = lambda x: self.collate_list(read_preprocessed_archive(x))\n        samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True)\n        samples = [x for x in samples if x is not None]\n        return samples\n\n    def preprocess(self, overwrite=False):\n        if overwrite or not self.is_preprocessed():\n            logger.info(\"Preprocessing data and saving it to cache.\")\n            logger.info(\n                f\"Dataset {self.__name__} data with the following units:\\n\"\n                f\"Energy: {self.energy_unit}, Distance: {self.distance_unit}, \"\n                f\"Forces: {self.force_unit if self.__force_methods__ else 'None'}\"\n            )\n            entries = self.read_raw_entries()\n            self.collate_and_save_list(entries)\n\n    def collate_and_save_list(self, list_entries):\n        n_molecules, n_atoms = 0, 0\n        for i in range(len(list_entries)):\n            list_entries[i][\"position_idx_range\"] += n_atoms\n            n_atoms += list_entries[i][\"position_idx_range\"].max()\n            n_molecules += list_entries[i][\"position_idx_range\"].shape[0]\n\n        for key in self.data_keys:\n            first = list_entries[0][key]\n            shape = (n_molecules, *first.shape[1:])\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\")\n            out = np.memmap(local_path, mode=\"w+\", dtype=first.dtype, shape=shape)\n\n            start = 0\n            for i in range(len(list_entries)):\n                x = list_entries[i].pop(key)\n                n = x.shape[0]\n                out[start : start + n] = x\n                out.flush()\n            push_remote(local_path, 
overwrite=True)\n\n        # save smiles and subset\n        tmp, n = dict(name=[]), len(list_entries)\n        local_path = p_join(self.preprocess_path, \"props.pkl\")\n        names = [list_entries[i].pop(\"name\") for i in range(n)]\n        f = lambda xs: [dm.to_inchikey(x) for x in xs]\n        res = dm.parallelized(f, names, n_jobs=-1, progress=False)\n        for x in res:\n            tmp[\"name\"] += x\n        for key in [\"subset\", \"n_atoms\"]:\n            tmp[key] = []\n            for i in range(n):\n                tmp[key] += list(list_entries[i].pop(key))\n        with open(local_path, \"wb\") as f:\n            pkl.dump(tmp, f)\n        push_remote(local_path, overwrite=True)\n</code></pre>"},{"location":"API/datasets/proteinfragments.html","title":"Protein Fragments","text":""},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.MDDataset","title":"<code>MDDataset</code>","text":"<p>               Bases: <code>ProteinFragments</code></p> <p>MDDataset is a subset of the proteinfragments dataset that generated from the molecular dynamics with their model. The sampling was done with Molecular Dynamics at room temperature 300K in various solvent phase:</p> Subsets <p>Polyalanine:     All the polyalanine are sampled in gas phase. 
AceAla15Lys is     a polyalanine peptides capped with an N-terminal acetyl group     and a protonated lysine residue at the C-terminus,     Acela15nme is polyalanine peptide capped with an N-terminal acetyl group     and a C-terminal N-methyl amide group</p> <p>Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)</p> <p>Usage: <pre><code>from openqdc.datasets import MDDataset\ndataset = MDDataset()\n</code></pre></p> References <p>https://www.science.org/doi/10.1126/sciadv.adn4397</p> Source code in <code>openqdc/datasets/potential/proteinfragments.py</code> <pre><code>class MDDataset(ProteinFragments):\n    \"\"\"\n    MDDataset is a subset of the proteinfragments dataset that\n    generated from the molecular dynamics with their model.\n    The sampling was done with Molecular Dynamics\n    at room temperature 300K in various solvent phase:\n\n    Subsets:\n        Polyalanine:\n            All the polyalanine are sampled in gas phase. AceAla15Lys is\n            a polyalanine peptides capped with an N-terminal acetyl group\n            and a protonated lysine residue at the C-terminus,\n            Acela15nme is polyalanine peptide capped with an N-terminal acetyl group\n            and a C-terminal N-methyl amide group\\n\n        Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)\n\n    Usage:\n    ```python\n    from openqdc.datasets import MDDataset\n    dataset = MDDataset()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"mddataset\"\n\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"acala15nme_folding_clusters\", \"crambin\", \"minimahopping_acala15lysh\", \"minimahopping_acala15nme\"]\n    
}\n</code></pre>"},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.ProteinFragments","title":"<code>ProteinFragments</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>ProteinFragments is a dataset constructed from a subset of the the data was generated from a top-down and bottom-up approach:</p> Top-down <p>Fragments are generated by cutting out a spherical region around an atom (including solvent molecules) and saturating all dangling bonds. Sampling was done with the Molecular Dynamics (MD) method from conventional FF at room temperature.</p> Bottom-up <p>Fragments are generated by constructing chemical graphs of one to eight nonhydrogen atoms. Sampling of multiple conformers per fragments was done with MD simulations at high temperatures or normal mode sampling.</p> <p>Usage: <pre><code>from openqdc.datasets import ProteinFragments\ndataset = ProteinFragments()\n</code></pre></p> References <p>https://www.science.org/doi/10.1126/sciadv.adn4397</p> Source code in <code>openqdc/datasets/potential/proteinfragments.py</code> <pre><code>class ProteinFragments(BaseDataset):\n    \"\"\"\n    ProteinFragments is a dataset constructed from a subset of the\n    the data was generated from a top-down and bottom-up approach:\n\n    Top-down:\n        Fragments are generated by cutting out a spherical\n        region around an atom (including solvent molecules)\n        and saturating all dangling bonds.\n        Sampling was done with the Molecular Dynamics (MD) method from\n        conventional FF at room temperature.\n\n    Bottom-up:\n        Fragments are generated by constructing chemical graphs\n        of one to eight nonhydrogen atoms.\n        Sampling of multiple conformers per fragments was done with\n        MD simulations at high temperatures or normal mode sampling.\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import ProteinFragments\n    dataset = ProteinFragments()\n    ```\n\n   
 References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"proteinfragments\"\n    # PBE0/def2-TZVPP+MBD\n    __energy_methods__ = [\n        PotentialMethod.PBE0_MBD_DEF2_TZVPP,\n    ]\n\n    energy_target_names = [\n        \"PBE0+MBD/def2-TZVPP\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"general_protein_fragments\"]\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"proteinfragments\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"proteinfragments\", links=self.__links__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.extend(read_db(raw_path))\n        return samples\n</code></pre>"},{"location":"API/datasets/qm1b.html","title":"QM1B","text":""},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B","title":"<code>QM1B</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. 
Electronic properties for each conformation are then calculated using the density functional B3LYP and the basis set STO-3G.</p> <p>Usage: <pre><code>from openqdc.datasets import QM1B\ndataset = QM1B()\n</code></pre></p> References <p>https://arxiv.org/pdf/2311.01135</p> <p>https://github.com/graphcore-research/qm1b-dataset/</p> Source code in <code>openqdc/datasets/potential/qm1b.py</code> <pre><code>class QM1B(BaseDataset):\n    \"\"\"\n    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom\n    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are\n    subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit.\n    Electronic properties for each conformation are then calculated using the density functional B3LYP\n    and the basis set STO-3G.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B\n    dataset = QM1B()\n    ```\n\n    References:\n        https://arxiv.org/pdf/2311.01135\\n\n        https://github.com/graphcore-research/qm1b-dataset/\n    \"\"\"\n\n    __name__ = \"qm1b\"\n\n    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]\n    __force_methods__ = []\n\n    energy_target_names = [\"b3lyp/sto-3g\"]\n    force_target_names = []\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"ev/bohr\"\n    __links__ = {\n        \"qm1b_validation.parquet\": \"https://ndownloader.figshare.com/files/43005175\",\n        **{f\"part_{i:03d}.parquet\": f\"https://ndownloader.figshare.com/files/{FILE_NUM[i]}\" for i in range(0, 256)},\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qm1b\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        
filenames = list(map(lambda x: p_join(self.root, f\"part_{x:03d}.parquet\"), list(range(0, 256)))) + [\n            p_join(self.root, \"qm1b_validation.parquet\")\n        ]\n\n        def read_entries_parallel(filename):\n            df = pd.read_parquet(filename)\n\n            def extract_parallel(df, i):\n                return extract_from_row(df.iloc[i])\n\n            fn = partial(extract_parallel, df)\n            list_of_idxs = list(range(len(df)))\n            results = dm.utils.parallelized(fn, list_of_idxs, scheduler=\"threads\", progress=False)\n            return results\n\n        list_of_list = dm.utils.parallelized(read_entries_parallel, filenames, scheduler=\"processes\", progress=True)\n\n        return [x for xs in list_of_list for x in xs]\n</code></pre>"},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B_SMALL","title":"<code>QM1B_SMALL</code>","text":"<p>               Bases: <code>QM1B</code></p> <p>QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.</p> <p>Usage: <pre><code>from openqdc.datasets import QM1B_SMALL\ndataset = QM1B_SMALL()\n</code></pre></p> Source code in <code>openqdc/datasets/potential/qm1b.py</code> <pre><code>class QM1B_SMALL(QM1B):\n    \"\"\"\n    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B_SMALL\n    dataset = QM1B_SMALL()\n    ```\n    \"\"\"\n\n    __name__ = \"qm1b_small\"\n</code></pre>"},{"location":"API/datasets/qm7x.html","title":"QM7X","text":""},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X","title":"<code>QM7X</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. 
For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.</p> <p>Usage: <pre><code>from openqdc.datasets import QM7X\ndataset = QM7X()\n</code></pre></p> References <p>https://arxiv.org/abs/2006.15139</p> <p>https://zenodo.org/records/4288677</p> Source code in <code>openqdc/datasets/potential/qm7x.py</code> <pre><code>class QM7X(BaseDataset):\n    \"\"\"\n    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with\n    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations,\n    OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-\n    stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure\n    is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)\n    interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non\n    -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of\n    normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. 
The dataset has\n    energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X\n    dataset = QM7X()\n    ```\n\n    References:\n        https://arxiv.org/abs/2006.15139\\n\n        https://zenodo.org/records/4288677\n    \"\"\"\n\n    __name__ = \"qm7x\"\n\n    __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # \"pbe0/def2-tzvp\", \"dft3b\"]\n\n    energy_target_names = [\"ePBE0+MBD\", \"eDFTB+MBD\"]\n\n    __force_mask__ = [True, False]\n\n    force_target_names = [\"pbe0FOR\"]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {f\"{i}000.xz\": f\"https://zenodo.org/record/4288677/files/{i}000.xz\" for i in range(1, 9)}\n\n    def read_raw_entries(self):\n        samples = []\n        for i in range(1, 9):\n            raw_path = p_join(self.root, f\"{i}000\")\n            data = load_hdf5_file(raw_path)\n            samples += [\n                read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys())\n            ]\n\n        return samples\n</code></pre>"},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X_V2","title":"<code>QM7X_V2</code>","text":"<p>               Bases: <code>QM7X</code></p> <p>QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.</p> <p>Usage: <pre><code>from openqdc.datasets import QM7X_V2\ndataset = QM7X_V2()\n</code></pre></p> Source code in <code>openqdc/datasets/potential/qm7x.py</code> <pre><code>class QM7X_V2(QM7X):\n    \"\"\"\n    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X_V2\n    dataset = QM7X_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qm7x_v2\"\n    __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]\n  
  __force_mask__ = QM7X.__force_mask__ + [False]\n    energy_target_names = QM7X.energy_target_names + [\"PM6\"]\n    force_target_names = QM7X.force_target_names\n</code></pre>"},{"location":"API/datasets/qmugs.html","title":"Qmugs","text":""},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs","title":"<code>QMugs</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).</p> <p>Usage: <pre><code>from openqdc.datasets import QMugs\ndataset = QMugs()\n</code></pre></p> References <p>https://arxiv.org/abs/2107.00367</p> <p>https://www.nature.com/articles/s41597-022-01390-7#ethics</p> <p>https://www.research-collection.ethz.ch/handle/20.500.11850/482129</p> Source code in <code>openqdc/datasets/potential/qmugs.py</code> <pre><code>class QMugs(BaseDataset):\n    \"\"\"\n    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules\n    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB\n    method. 
Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical\n    method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs\n    dataset = QMugs()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00367\\n\n        https://www.nature.com/articles/s41597-022-01390-7#ethics\\n\n        https://www.research-collection.ethz.ch/handle/20.500.11850/482129\n    \"\"\"\n\n    __name__ = \"qmugs\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP]  # \"gfn2_xtb\", \"wb97x-d/def2-svp\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"summary.csv\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&amp;files=summary.csv\",\n        \"structures.tar.gz\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&amp;files=structures.tar.gz\",  # noqa\n    }\n\n    energy_target_names = [\n        \"GFN2:TOTAL_ENERGY\",\n        \"DFT:TOTAL_ENERGY\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"structures\")\n        mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]\n\n        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler=\"threads\")\n        return samples\n</code></pre>"},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs_V2","title":"<code>QMugs_V2</code>","text":"<p>               Bases: <code>QMugs</code></p> <p>QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 4.2M geometries.</p> <p>Usage: <pre><code>from openqdc.datasets import QMugs_V2\ndataset = QMugs_V2()\n</code></pre></p> Source code in <code>openqdc/datasets/potential/qmugs.py</code> <pre><code>class QMugs_V2(QMugs):\n    \"\"\"\n    QMugs_V2 is an extension of the QMugs dataset 
containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs_V2\n    dataset = QMugs_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qmugs_v2\"\n    __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]\n    energy_target_names = QMugs.energy_target_names + [\"PM6\"]\n    __force_mask__ = QMugs.__force_mask__ + [False]\n</code></pre>"},{"location":"API/datasets/qmx.html","title":"QMX","text":""},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7","title":"<code>QM7</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM7 is a dataset constructed from subsets of the GDB-13 database ( stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecules conformation are optimized using DFT at the PBE0/def2-TZVP level of theory.</p> Chemical species <p>[C, N, O, S, H]</p> <p>Usage: <pre><code>from openqdc.datasets import QM7\ndataset = QM7()\n</code></pre></p> References <p>https://arxiv.org/pdf/1703.00564</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM7(QMX):\n    \"\"\"\n    QM7 is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7\n    dataset = QM7()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7.hdf5.gz\": \"https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1\"}\n    __name__ = \"qm7\"\n\n    energy_target_names = [\n        \"B2PLYP-D3(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3(BJ):def2-svp\",\n        
\"B2PLYP-D3(BJ):def2-tzvp\",\n        \"B2PLYP-D3(BJ):sto-3g\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"MP2:aug-cc-pvdz\",\n        \"MP2:aug-cc-pvtz\",\n        \"MP2:def2-svp\",\n        \"MP2:def2-tzvp\",\n        \"MP2:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        
\"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7b","title":"<code>QM7b</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM7b is a dataset constructed from subsets of the GDB-13 database ( stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecules conformation are optimized using DFT at the PBE0/def2-TZVP level of theory.</p> Chemical species <p>[C, N, O, S, Cl, H]</p> <p>Usage: <pre><code>from openqdc.datasets import QM7b\ndataset = QM7b()\n</code></pre></p> References <p>https://arxiv.org/pdf/1703.00564</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM7b(QMX):\n    \"\"\"\n    QM7b is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, Cl, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7b\n    dataset = QM7b()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7b.hdf5.gz\": \"https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1\"}\n    __name__ = 
\"qm7b\"\n    energy_target_names = [\n        \"CCSD(T0):cc-pVDZ\",\n        \"HF:cc-pVDZ\",\n        \"HF:cc-pVTZ\",\n        \"MP2:cc-pVTZ\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        
\"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"]\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM8","title":"<code>QM8</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM8 is the subset of QM9 used in a study on modeling quantum mechanical calculations of electronic spectra and excited state energy (a increase of energy from the ground states) of small molecules up to eight heavy atoms. Multiple methods were used, including time-dependent density functional theories (TDDFT) and second-order approximate coupled-cluster (CC2). The molecules conformations are relaxed geometries computed using the DFT B3LYP with basis set 6-31G(2df,p). 
For more information about the sampling, check QM9 dataset.</p> <p>Usage: <pre><code>from openqdc.datasets import QM8\ndataset = QM8()\n</code></pre></p> References <p>https://arxiv.org/pdf/1504.01966</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM8(QMX):\n    \"\"\"QM8 is the subset of QM9 used in a study on modeling quantum\n    mechanical calculations of electronic spectra and excited\n    state energy (a increase of energy from the ground states) of small molecules\n    up to eight heavy atoms.\n    Multiple methods were used, including\n    time-dependent density functional theories (TDDFT) and\n    second-order approximate coupled-cluster (CC2).\n    The molecules conformations are relaxed geometries computed using\n    the DFT B3LYP with basis set 6-31G(2df,p).\n    For more information about the sampling, check QM9 dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM8\n    dataset = QM8()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1504.01966\n    \"\"\"\n\n    __name__ = \"qm8\"\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n\n    __links__ = {\n        \"qm8.csv\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv\",\n        \"qm8.tar.gz\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz\",\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"qm8.csv\"))\n        mols = dm.read_sdf(p_join(self.root, \"qm8.sdf\"), sanitize=False, remove_hs=False)\n        samples = []\n        for idx_row, mol in zip(df.iterrows(), mols):\n            _, row = idx_row\n            positions = mol.GetConformer().GetPositions()\n            x = 
get_atomic_number_and_charge(mol)\n            n_atoms = positions.shape[0]\n            samples.append(\n                dict(\n                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),\n                    name=np.array([row[\"smiles\"]]),\n                    energies=np.array(\n                        [\n                            row[\n                                [\"E1-CC2\", \"E2-CC2\", \"E1-PBE0\", \"E2-PBE0\", \"E1-PBE0.1\", \"E2-PBE0.1\", \"E1-CAM\", \"E2-CAM\"]\n                            ].tolist()\n                        ],\n                        dtype=np.float64,\n                    ).reshape(1, -1),\n                    n_atoms=np.array([n_atoms], dtype=np.int32),\n                    subset=np.array([f\"{self.__name__}\"]),\n                )\n            )\n        return samples\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM9","title":"<code>QM9</code>","text":"<p>               Bases: <code>QMX</code></p> <p>QM9 is a dataset of 134k molecules constructed from subsets of the GDB-17 database, containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at B3LYP/6-31G(2df,p) level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed by relaxing geometries with quantum mechanical method B3LYP.</p> <p>Usage: <pre><code>from openqdc.datasets import QM9\ndataset = QM9()\n</code></pre></p> Reference <p>https://www.nature.com/articles/sdata201422</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QM9(QMX):\n    \"\"\"\n    QM9 is a dataset of 134k molecules constructed from subsets of the GDB-17 database,\n    containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at B3LYP/6-31G(2df,p)\n    level of quantum chemistry. 
For each of the 134k molecules, equilibrium geometries are computed\n    by relaxing geometries with quantum mechanical method B3LYP.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM9\n    dataset = QM9()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/sdata201422\n    \"\"\"\n\n    __links__ = {\"qm9.hdf5.gz\": \"https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1\"}\n    __name__ = \"qm9\"\n    energy_target_names = [\n        \"Internal energy at 0 K\",\n        \"B3LYP:def2-svp\",\n        \"HF:cc-pvtz\",\n        \"HF:sto-3g\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n    ]\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n</code></pre>"},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QMX","title":"<code>QMX</code>","text":"<p>               Bases: <code>ABC</code>, <code>BaseDataset</code></p> <p>QMX dataset base abstract class</p> Source code in <code>openqdc/datasets/potential/qmx.py</code> <pre><code>class QMX(ABC, BaseDataset):\n    \"\"\"\n    QMX dataset base abstract class\n    \"\"\"\n\n    __name__ = \"qm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qmx\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, 
\"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def config(self):\n        assert len(self.__links__) &gt; 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"qmx\", links=self.__links__)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)\n        return samples\n</code></pre>"},{"location":"API/datasets/revmd17.html","title":"RevMD17","text":""},{"location":"API/datasets/revmd17.html#openqdc.datasets.potential.revmd17.RevMD17","title":"<code>RevMD17</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and very dense DFT integration grid. The dataset contains the following molecules:     Benzene: 627000 samples</p> <pre><code>Uracil: 133000 samples\n\nNaphthalene: 326000 samples\n\nAspirin: 211000 samples\n\nSalicylic Acid: 320000 samples\n\nMalonaldehyde: 993000 samples\n\nEthanol: 555000 samples\n\nToluene: 100000 samples\n</code></pre> <p>Usage: <pre><code>from openqdc.datasets import RevMD17\ndataset = RevMD17()\n</code></pre></p> References <p>https://arxiv.org/abs/2007.09593</p> Source code in <code>openqdc/datasets/potential/revmd17.py</code> <pre><code>class RevMD17(BaseDataset):\n    \"\"\"\n    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original\n    dataset. 
The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies\n    are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and very dense DFT integration\n    grid. The dataset contains the following molecules:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naphthalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import RevMD17\n    dataset = RevMD17()\n    ```\n\n    References:\n        https://arxiv.org/abs/2007.09593\n    \"\"\"\n\n    __name__ = \"revmd17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP\n        # \"pbe/def2-tzvp\",\n    ]\n    __force_mask__ = [True]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_methods__ = [\n        \"pbe/def2-tzvp\",\n    ]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n    __links__ = {\"revmd17.zip\": \"https://figshare.com/ndownloader/articles/12672038/versions/3\"}\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    def read_raw_entries(self):\n        entries_list = []\n        decompress_tar_gz(p_join(self.root, \"rmd17.tar.bz2\"))\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n</code></pre>"},{"location":"API/datasets/sn2_rxn.html","title":"SN2 RXN","text":""},{"location":"API/datasets/sn2_rxn.html#openqdc.datasets.potential.sn2_rxn.SN2RXN","title":"<code>SN2RXN</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>This dataset probes chemical reactions of methyl halides with halide anions, i.e. 
X- + CH3Y -&gt; CH3X +  Y-, and contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset contains 452,709 structures along with the energy, force and dipole moments.</p> <p>Usage: <pre><code>from openqdc.datasets import SN2RXN\ndataset = SN2RXN()\n</code></pre></p> References <p>https://doi.org/10.1021/acs.jctc.9b00181</p> <p>https://zenodo.org/records/2605341</p> Source code in <code>openqdc/datasets/potential/sn2_rxn.py</code> <pre><code>class SN2RXN(BaseDataset):\n    \"\"\"\n    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -&gt; CH3X +  Y-, and\n    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by\n    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment\n    (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and\n    for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. 
The dataset\n    contains 452,709 structures along with the energy, force and dipole moments.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SN2RXN\n    dataset = SN2RXN()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605341\n    \"\"\"\n\n    __name__ = \"sn2_rxn\"\n\n    __energy_methods__ = [\n        PotentialMethod.DSD_BLYP_D3_BJ_DEF2_TZVP\n        # \"dsd-blyp-d3(bj)/def2-tzvp\",\n    ]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"sn2_rxn.npz\": \"https://zenodo.org/records/2605341/files/sn2_reactions.npz\"}\n\n    energy_target_names = [\n        # TODO: We need to revalidate this to make sure that is not atomization energies.\n        \"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"DSD-BLYP-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"sn2_rxn.npz\")\n        data = np.load(raw_path)\n        samples = extract_npz_entry(data)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/solvated_peptides.html","title":"Solvated Peptides","text":""},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides","title":"<code>SolvatedPeptides</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\" and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). 
Structures are saved every 10 steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import SolvatedPeptides\ndataset = SolvatedPeptides()\n</code></pre></p> References <p>https://doi.org/10.1021/acs.jctc.9b00181</p> <p>https://zenodo.org/records/2605372</p> Source code in <code>openqdc/datasets/potential/solvated_peptides.py</code> <pre><code>class SolvatedPeptides(BaseDataset):\n    \"\"\"\n    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\"\n    and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are\n    run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10\n    steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SolvatedPeptides\n    dataset = SolvatedPeptides()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605372\n    \"\"\"\n\n    __name__ = \"solvated_peptides\"\n\n    __energy_methods__ = [\n        PotentialMethod.REVPBE_D3_BJ_DEF2_TZVP\n        # \"revpbe-d3(bj)/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    # TO CHECK\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"solvated_peptides.hdf5.gz\": \"https://zenodo.org/record/3585804/files/213.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display 
format\n        \"\"\"\n        return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"solvated_peptides.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"solvated_peptides\", self.energy_target_names, self.force_target_names)\n\n        return samples\n</code></pre>"},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides.__smiles_converter__","title":"<code>__smiles_converter__(x)</code>","text":"<p>util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format</p> Source code in <code>openqdc/datasets/potential/solvated_peptides.py</code> <pre><code>def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n</code></pre>"},{"location":"API/datasets/spice.html","title":"Spice","text":""},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.Spice","title":"<code>Spice</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate 100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and molecular dynamics at 1ps and 100K. 
Forces and energies for conformations are calculated at the wB97M-D3(BJ)/def2-TZVPPD level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import Spice\ndataset = Spice()\n</code></pre></p> References <p>https://arxiv.org/abs/2209.10702</p> <p>https://github.com/openmm/spice-dataset</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>class Spice(BaseDataset):\n    \"\"\"\n    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of\n    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,\n    and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate\n    100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and\n    molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the\n    wB97M-D3(BJ)/def2-TZVPPD level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Spice\n    dataset = Spice()\n    ```\n\n    References:\n        https://arxiv.org/abs/2209.10702\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice\"\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n\n    energy_target_names = [\"dft_total_energy\"]\n\n    force_target_names = [\"dft_total_gradient\"]\n\n    subset_mapping = {\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Dipeptides Single Points Dataset v1.2\": \"Dipeptides\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.0\": 
\"DES370K Dimers\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE Ion Pairs Single Points Dataset v1.1\": \"Ion Pairs\",\n    }\n    __links__ = {\"SPICE-1.1.4.hdf5\": \"https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5\"}\n\n    def convert_forces(self, x):\n        return (-1.0) * super().convert_forces(x)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"SPICE-1.1.4.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        tmp = [read_record(data[mol_name], self) for mol_name in tqdm(data)]  # don't use parallelized here\n\n        return tmp\n</code></pre>"},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceV2","title":"<code>SpiceV2</code>","text":"<p>               Bases: <code>Spice</code></p> <p>SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules. The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and (5) Fixing bad calculations from the Spice dataset. 
The data generation process is the same as the Spice dataset.</p> <p>Usage: <pre><code>from openqdc.datasets import SpiceV2\ndataset = SpiceV2()\n</code></pre></p> References <p>https://github.com/openmm/spice-dataset/releases/tag/2.0.0</p> <p>https://github.com/openmm/spice-dataset</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>class SpiceV2(Spice):\n    \"\"\"\n    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules.\n    The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain\n    silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve\n    sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and\n    (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceV2\n    dataset = SpiceV2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spicev2\"\n\n    subset_mapping = {\n        \"SPICE Dipeptides Single Points Dataset v1.3\": \"Dipeptides\",\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Water Clusters v1.0\": \"Water Clusters\",\n        \"SPICE Solvated PubChem Set 1 v1.0\": \"Solvated PubChem\",\n        \"SPICE Amino Acid Ligand v1.0\": \"Amino Acid Ligand\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single 
Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 7 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 8 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 9 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 10 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.1\": \"DES370K Dimers\",\n        \"SPICE PubChem Boron Silicon v1.0\": \"PubChem Boron Silicon\",\n        \"SPICE Ion Pairs Single Points Dataset v1.2\": \"Ion Pairs\",\n    }\n    __links__ = {\"spice-2.0.0.hdf5\": \"https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"spice-2.0.0.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        # Entry 40132 without positions, skip it\n        # don't use parallelized here\n        tmp = [read_record(data[mol_name], self) for i, mol_name in enumerate(tqdm(data)) if i != 40132]\n\n        return tmp\n</code></pre>"},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceVL2","title":"<code>SpiceVL2</code>","text":"<p>               Bases: <code>SpiceV2</code></p> <p>SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.</p> <p>Usage: <pre><code>from openqdc.datasets import SpiceVL2\ndataset = SpiceVL2()\n</code></pre></p> References <p>https://github.com/openmm/spice-dataset/releases/tag/2.0.0</p> <p>https://github.com/openmm/spice-dataset</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>class SpiceVL2(SpiceV2):\n    \"\"\"\n    SpiceVL2 is an extension of the SpiceV2 dataset with additional 
semi-empirical GFN2-xTB and PM6 energy methods.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceVL2\n    dataset = SpiceVL2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice_vl2\"\n\n    __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]\n    energy_target_names = SpiceV2.energy_target_names + [\"GFN2,\" \"PM6\"]\n    __force_mask__ = SpiceV2.__force_mask__ + [False, False]\n</code></pre>"},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.read_record","title":"<code>read_record(r, obj)</code>","text":"<p>Read record from hdf5 file.     r : hdf5 record     obj : Spice class object used to grab subset and names</p> Source code in <code>openqdc/datasets/potential/spice.py</code> <pre><code>def read_record(r, obj):\n    \"\"\"\n    Read record from hdf5 file.\n        r : hdf5 record\n        obj : Spice class object used to grab subset and names\n    \"\"\"\n    smiles = r[\"smiles\"].asstr()[0]\n    subset = r[\"subset\"][0].decode(\"utf-8\")\n    n_confs = r[\"conformations\"].shape[0]\n    x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))\n    positions = r[\"conformations\"][:]\n\n    res = dict(\n        name=np.array([smiles] * n_confs),\n        subset=np.array([obj.subset_mapping[subset]] * n_confs),\n        energies=r[obj.energy_target_names[0]][:][:, None].astype(np.float64),\n        forces=r[obj.force_target_names[0]][:].reshape(\n            -1, 3, 1\n        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method\n        atomic_inputs=np.concatenate(\n            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32\n        ).reshape(-1, 5),\n        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),\n    )\n\n    return 
res\n</code></pre>"},{"location":"API/datasets/splinter.html","title":"Splinter","text":""},{"location":"API/datasets/splinter.html#openqdc.datasets.interaction.splinter.Splinter","title":"<code>Splinter</code>","text":"<p>               Bases: <code>BaseInteractionDataset</code></p> <p>Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies and the various components are computed using SAPT0/aug-cc-pV(D+d)Z method.</p> <p>Usage: <pre><code>from openqdc.datasets import Splinter\ndataset = Splinter()\n</code></pre></p> Reference <p>https://doi.org/10.1038/s41597-023-02443-1</p> Source code in <code>openqdc/datasets/interaction/splinter.py</code> <pre><code>class Splinter(BaseInteractionDataset):\n    \"\"\"\n    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated\n    by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. 
The interaction energies\n    and the various components are computed using SAPT0/qug-cc-pV(D=d)Z method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Splinter\n    dataset = Splinter()\n    ```\n\n    Reference:\n        https://doi.org/10.1038/s41597-023-02443-1\n    \"\"\"\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __name__ = \"splinter\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        # \"sapt0/jun-cc-pV(D+d)Z_unscaled\", #TODO: we need to pick the unscaled version only here\n        # \"sapt0/jun-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_unscaled\",\n   
     # \"sapt0/aug-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_scaled\",\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = []\n    __links__ = {\n        \"dimerpairs.0.tar.gz\": \"https://figshare.com/ndownloader/files/39449167\",\n        \"dimerpairs.1.tar.gz\": \"https://figshare.com/ndownloader/files/40271983\",\n        \"dimerpairs.2.tar.gz\": \"https://figshare.com/ndownloader/files/40271989\",\n        \"dimerpairs.3.tar.gz\": \"https://figshare.com/ndownloader/files/40272001\",\n        \"dimerpairs.4.tar.gz\": \"https://figshare.com/ndownloader/files/40272022\",\n        \"dimerpairs.5.tar.gz\": \"https://figshare.com/ndownloader/files/40552931\",\n        \"dimerpairs.6.tar.gz\": \"https://figshare.com/ndownloader/files/40272040\",\n        \"dimerpairs.7.tar.gz\": \"https://figshare.com/ndownloader/files/40272052\",\n        \"dimerpairs.8.tar.gz\": \"https://figshare.com/ndownloader/files/40272061\",\n        \"dimerpairs.9.tar.gz\": 
\"https://figshare.com/ndownloader/files/40272064\",\n        \"dimerpairs_nonstandard.tar.gz\": \"https://figshare.com/ndownloader/files/40272067\",\n        \"lig_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272070\",\n        \"lig_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272073\",\n        \"prot_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272076\",\n        \"prot_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272079\",\n        \"merge_monomers.py\": \"https://figshare.com/ndownloader/files/41807682\",\n    }\n\n    def read_raw_entries(self) -&gt; List[Dict]:\n        logger.info(f\"Reading Splinter interaction data from {self.root}\")\n        data = []\n        i = 0\n        with tqdm(total=1680022) as progress_bar:\n            for root, dirs, files in os.walk(self.root):  # total is currently an approximation\n                for filename in files:\n                    if not filename.endswith(\".xyz\"):\n                        continue\n                    i += 1\n                    filepath = os.path.join(root, filename)\n                    filein = open(filepath, \"r\")\n                    lines = list(map(lambda x: x.strip(), filein.readlines()))\n                    n_atoms = np.array([int(lines[0])], dtype=np.int32)\n                    metadata = lines[1].split(\",\")\n                    try:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            r,\n                            theta_P,\n                            tau_P,\n                            theta_L,\n                            tau_L,\n                            tau_PL,\n                        ) = metadata[0].split(\"_\")\n                        
index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(\n                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])\n                        )\n                    except ValueError:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            _,\n                        ) = metadata[0].split(\"_\")\n                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6\n                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)\n                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)\n                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))\n                    lines = list(map(lambda x: x.split(), lines[2:]))\n                    pos = np.array(lines)[:, 1:].astype(np.float32)\n                    elems = np.array(lines)[:, 0]\n                    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)\n                    natoms0 = n_atoms_ptr[0]\n                    natoms1 = n_atoms[0] - natoms0\n                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)\n                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)\n                    subset = np.array([root.split(\"/\")[-1]])\n\n                    item = dict(\n                        energies=energies,\n                        subset=subset,\n                        n_atoms=n_atoms,\n                        n_atoms_ptr=n_atoms_ptr,\n                        atomic_inputs=atomic_inputs,\n                        protein_monomer_name=np.array([protein_monomer_name]),\n                        
protein_interaction_site_type=np.array([protein_interaction_site_type]),\n                        ligand_monomer_name=np.array([ligand_monomer_name]),\n                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),\n                        index=np.array([index], dtype=np.float32),\n                        r=np.array([r], dtype=np.float32),\n                        theta_P=np.array([theta_P], dtype=np.float32),\n                        tau_P=np.array([tau_P], dtype=np.float32),\n                        theta_L=np.array([theta_L], dtype=np.float32),\n                        tau_L=np.array([tau_L], dtype=np.float32),\n                        tau_PL=np.array([tau_PL], dtype=np.float32),\n                        name=np.array([protein_monomer_name + \".\" + ligand_monomer_name]),\n                    )\n                    data.append(item)\n                    progress_bar.update(1)\n        logger.info(f\"Processed {i} files in total\")\n        return data\n</code></pre>"},{"location":"API/datasets/tmqm.html","title":"TMQM","text":""},{"location":"API/datasets/tmqm.html#openqdc.datasets.potential.tmqm.TMQM","title":"<code>TMQM</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated at the TPSSh-D3BJ/def2-SV DFT level of theory. 
Structures are first extracted from Cambridge Structure Database and then optimized in gas phase with the extended tight-binding GFN2-xTB method.</p> <p>Usage: <pre><code>from openqdc.datasets import TMQM\ndataset = TMQM()\n</code></pre></p> References <p>https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041</p> <p>https://github.com/bbskjelstad/tmqm</p> Source code in <code>openqdc/datasets/potential/tmqm.py</code> <pre><code>class TMQM(BaseDataset):\n    \"\"\"\n    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of\n    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated\n    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database\n    and then optimized in gas phase with the extended tight-binding GFN2-xTB method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import TMQM\n    dataset = TMQM()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\\n\n        https://github.com/bbskjelstad/tmqm\n    \"\"\"\n\n    __name__ = \"tmqm\"\n\n    __energy_methods__ = [PotentialMethod.TPSSH_DEF2_TZVP]  # \"tpssh/def2-tzvp\"]\n\n    energy_target_names = [\"TPSSh/def2TZVP level\"]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        x: f\"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}\"\n        for x in [\"tmQM_X1.xyz.gz\", \"tmQM_X2.xyz.gz\", \"tmQM_y.csv\", \"Benchmark2_TPSSh_Opt.xyz\"]\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"tmQM_y.csv\"), sep=\";\", usecols=[\"CSD_code\", \"Electronic_E\"])\n        e_map = dict(zip(df[\"CSD_code\"], df[\"Electronic_E\"]))\n        raw_fnames = [\"tmQM_X1.xyz\", \"tmQM_X2.xyz\", \"Benchmark2_TPSSh_Opt.xyz\"]\n        samples = []\n        for fname in raw_fnames:\n         
   data = read_xyz(p_join(self.root, fname), e_map)\n            samples += data\n\n        return samples\n</code></pre>"},{"location":"API/datasets/transition1x.html","title":"Transition1X","text":""},{"location":"API/datasets/transition1x.html#openqdc.datasets.potential.transition1x.Transition1X","title":"<code>Transition1X</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. The geometries and the transition states are generated by running Nudged Elastic Band (NEB) with DFT.</p> <p>Usage: <pre><code>from openqdc.datasets import Transition1X\ndataset = Transition1X()\n</code></pre></p> <p>References: - https://www.nature.com/articles/s41597-022-01870-w</p> <ul> <li>https://gitlab.com/matschreiner/Transition1x</li> </ul> Source code in <code>openqdc/datasets/potential/transition1x.py</code> <pre><code>class Transition1X(BaseDataset):\n    \"\"\"\n    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy\n    and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. 
The geometries and\n    the transition states are generated by running Nudged Elastic Band (NEB) with DFT.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Transition1X\n    dataset = Transition1X()\n    ```\n\n    References:\n    - https://www.nature.com/articles/s41597-022-01870-w\\n\n    - https://gitlab.com/matschreiner/Transition1x\\n\n    \"\"\"\n\n    __name__ = \"transition1x\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D\n        # \"wb97x/6-31G(d)\",\n    ]\n\n    energy_target_names = [\n        \"wB97x_6-31G(d).energy\",\n    ]\n\n    __force_mask__ = [True]\n    force_target_names = [\n        \"wB97x_6-31G(d).forces\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"Transition1x.h5\": \"https://figshare.com/ndownloader/files/36035789\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"Transition1x.h5\")\n        f = load_hdf5_file(raw_path)[\"data\"]\n\n        res = sum([read_record(f[g], group=g) for g in tqdm(f)], [])  # don't use parallelized here\n        return res\n</code></pre>"},{"location":"API/datasets/vqm24.html","title":"VQM24","text":""},{"location":"API/datasets/vqm24.html#openqdc.datasets.potential.vqm24.VQM24","title":"<code>VQM24</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and relaxed with DFT method wB97x-D3/cc-pVDZ. 
The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.</p> <p>Usage: <pre><code>from openqdc.datasets import VQM24\ndataset = VQM24()\n</code></pre></p> Reference <p>https://arxiv.org/abs/2405.05961</p> Source code in <code>openqdc/datasets/potential/vqm24.py</code> <pre><code>class VQM24(BaseDataset):\n    \"\"\"\n    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical\n    properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional\n    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and\n    relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.\n\n    Usage:\n    ```python\n    from openqdc.datasets import VQM24\n    dataset = VQM24()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2405.05961\n    \"\"\"\n\n    __name__ = \"vqm24\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_CC_PVDZ,  # \"wB97x-D3/cc-pVDZ.\"\n    ]\n\n    energy_target_names = [\n        \"wB97x-D3/cc-pVDZ\",\n    ]\n    # \u03c9B97X-D3/cc-pVDZ\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        f\"{name}.npz\": f\"https://zenodo.org/records/11164951/files/{name}.npz?download=1\"\n        for name in [\"DFT_all\", \"DFT_saddles\", \"DFT_uniques\", \"DMC\"]\n    }\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.append(read_npz_entry(raw_path))\n        return samples\n</code></pre>"},{"location":"API/datasets/waterclusters.html","title":"SCAN Waterclusters","text":""},{"location":"API/datasets/waterclusters.html#openqdc.datasets.potential.waterclusters.SCANWaterClusters","title":"<code>SCANWaterClusters</code>","text":"<p>               Bases: 
<code>BaseDataset</code></p> <p>The SCAN Water Clusters dataset contains conformations of neutral water clusters containing up to 20 monomers, charged water clusters, and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212. Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics simulations using AMBER 9 and optimized; the lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.</p> Chemical Species <p>[H, O, Li, Na, K, F, Cl, Br]</p> <p>Usage: <pre><code>from openqdc.datasets import SCANWaterClusters\ndataset = SCANWaterClusters()\n</code></pre></p> References <p>https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec</p> <p>https://github.com/esoteric-ephemera/water_cluster_density_errors</p> Source code in <code>openqdc/datasets/potential/waterclusters.py</code> <pre><code>class SCANWaterClusters(BaseDataset):\n    \"\"\"\n    The SCAN Water Clusters dataset contains conformations of\n    neutral water clusters containing up to 20 monomers, charged water clusters,\n    and alkali- and halide-water clusters. 
This dataset consists of our data sets of water clusters:\n    the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14\n    neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of\n    ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212.\n    Water clusters were obtained from  10 nanosecond gas-phase molecular dynamics\n    simulations using AMBER 9 and optimized to obtain\n    lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.\n\n\n    Chemical Species:\n        [H, O, Li, Na, K, F, Cl, Br]\n\n    Usage:\n    ```python\n    from openqdc.datasets import SCANWaterClusters\n    dataset = SCANWaterClusters()\n    ```\n\n    References:\n        https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\\n\n        https://github.com/esoteric-ephemera/water_cluster_density_errors\n    \"\"\"\n\n    __name__ = \"scanwaterclusters\"\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    energy_target_names = [\n        \"HF\",\n        \"HF-r2SCAN-DC4\",\n        \"SCAN\",\n        \"SCAN@HF\",\n        \"SCAN@r2SCAN50\",\n        \"r2SCAN\",\n        \"r2SCAN@HF\",\n        \"r2SCAN@r2SCAN50\",\n        \"r2SCAN50\",\n        \"r2SCAN100\",\n        \"r2SCAN10\",\n        \"r2SCAN20\",\n        \"r2SCAN25\",\n        \"r2SCAN30\",\n        \"r2SCAN40\",\n        \"r2SCAN60\",\n        \"r2SCAN70\",\n        \"r2SCAN80\",\n        \"r2SCAN90\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]\n    force_target_names = []\n    # 27            # 9 level\n    subsets = [\"BEGDB_H2O\", \"WATER27\", \"H2O_alkali_clusters\", \"H2O_halide_clusters\"]\n    __links__ = {\n        \"geometries.json.gz\": 
\"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True\",  # noqa\n        \"total_energies.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True\",  # noqa\n    }\n\n    def read_raw_entries(self):\n        entries = []  # noqa\n        for i, subset in enumerate(self.subsets):\n            geometries = read_geometries(p_join(self.root, \"geometries.json.gz\"), subset)\n            energies = read_energies(p_join(self.root, \"total_energies.json.gz\"), subset)\n            datum = {}\n            for k in energies:\n                _ = energies[k].pop(\"metadata\")\n                datum[k] = energies[k][\"total_energies\"]\n            entries.extend(format_geometry_and_entries(geometries, datum, subset))\n        return entries\n</code></pre>"},{"location":"API/datasets/waterclusters3_30.html","title":"Waterclusters3_30","text":""},{"location":"API/datasets/waterclusters3_30.html#openqdc.datasets.potential.waterclusters3_30.WaterClusters","title":"<code>WaterClusters</code>","text":"<p>               Bases: <code>BaseDataset</code></p> <p>The WaterClusters dataset contains putative minima and low energy networks for water clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. It contains approximately 4.5 mil. structures. 
Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.</p> Chemical Species <p>[\"H\", \"O\"]</p> <p>Usage: <pre><code>from openqdc.datasets import WaterClusters\ndataset = WaterClusters()\n</code></pre></p> References <p>https://doi.org/10.1063/1.5128378</p> <p>https://sites.uw.edu/wdbase/database-of-water-clusters/</p> Source code in <code>openqdc/datasets/potential/waterclusters3_30.py</code> <pre><code>class WaterClusters(BaseDataset):\n    \"\"\"\n    The WaterClusters dataset contains putative minima and low energy networks for water\n    clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with\n    the TTM2.1-F ab-initio based interaction potential for water.\n    It contains approximately 4.5 mil. structures.\n    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.\n\n    Chemical Species:\n        [\"H\", \"O\"]\n\n    Usage:\n    ```python\n    from openqdc.datasets import WaterClusters\n    dataset = WaterClusters()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5128378\\n\n        https://sites.uw.edu/wdbase/database-of-water-clusters/\\n\n    \"\"\"\n\n    __name__ = \"waterclusters3_30\"\n\n    # Energy in hartree, all zeros by default\n    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [PotentialMethod.TTM2_1_F]  # \"ttm2.1-f\"\n    energy_target_names = [\"TTM2.1-F Potential\"]\n    __links__ = {\"W3-W30_all_geoms_TTM2.1-F.zip\": \"https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n\"}\n\n    def read_raw_entries(self):\n        samples = []\n        parent_folder = p_join(self.root, \"W3-W30_all_geoms_TTM2.1-F/\")\n        for i in range(3, 31):\n            name = f\"W{i}_geoms_all\"\n            zip_path = p_join(parent_folder, f\"{name}.zip\")\n            xyz_path = 
p_join(parent_folder, f\"{name}.xyz\")\n            with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n                zip_ref.extractall(parent_folder)\n\n            data = read_xyz(xyz_path, i)\n            samples += data\n\n        return samples\n</code></pre>"},{"location":"API/datasets/x40.html","title":"X40","text":""},{"location":"API/datasets/x40.html#openqdc.datasets.interaction.x40.X40","title":"<code>X40</code>","text":"<p>               Bases: <code>YamlDataset</code></p> <p>X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules where the halogens participate in various interaction types such as electrostatic interactions, london dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are computed with CCSD(T)/CBS level of theory.</p> <p>Usage: <pre><code>from openqdc.datasets import X40\ndataset = X40()\n</code></pre></p> Reference <p>https://pubs.acs.org/doi/10.1021/ct300647k</p> Source code in <code>openqdc/datasets/interaction/x40.py</code> <pre><code>class X40(YamlDataset):\n    \"\"\"\n    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules\n    where the halogens participate in various interaction types such as electrostatic interactions, london\n    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic\n    molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. 
The geometries\n    are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are\n    computed with CCSD(T)/CBS level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import X40\n    dataset = X40()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct300647k\n    \"\"\"\n\n    __name__ = \"x40\"\n    __energy_methods__ = [\n        InteractionMethod.CCSD_T_CBS,  # \"CCSD(T)/CBS\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.DCCSDT_HA_DZ,  # \"dCCSD(T)/haDZ\",\n        InteractionMethod.DCCSDT_HA_TZ,  # \"dCCSD(T)/haTZ\",\n        InteractionMethod.MP2_5_CBS_ADZ,  # \"MP2.5/CBS(aDZ)\",\n    ]\n    __links__ = {\n        \"x40.yaml\": \"http://cuby4.molecular.cz/download_datasets/x40.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/X40.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.shortname\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        xyz_path = p_join(root, f\"{filename}.xyz\")\n        with open(xyz_path, \"r\") as xyz_file:  # avoid not closing the file\n            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))\n            setup = lines.pop(1)\n            n_atoms_first = setup[0].split(\"-\")[1]\n            n_atoms_ptr = np.array([int(n_atoms_first)], dtype=np.int32)\n            return n_atoms_ptr\n</code></pre>"},{"location":"tutorials/usage.html","title":"OpenQDC Hands-on Tutorial","text":"In\u00a0[31]: Copied! 
<pre>from openqdc.datasets import Spice\nds = Spice(\n    energy_unit=\"kcal/mol\",\n    distance_unit=\"ang\",\n)\n</pre> from openqdc.datasets import Spice ds = Spice(     energy_unit=\"kcal/mol\",     distance_unit=\"ang\", )  <pre>2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:381 - Reading preprocessed data.\n2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:382 - Dataset spice with the following units:\n                     Energy: hartree,\n                     Distance: bohr,\n                     Forces: hartree/bohr\n2024-02-29 12:17:13.978 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded atomic_inputs with shape (33175288, 5), dtype float32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded position_idx_range with shape (1110165, 2), dtype int32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded energies with shape (1110165, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded forces with shape (33175288, 3, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded name with shape (1110165,), dtype &lt;U632\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded subset with shape (1110165,), dtype &lt;U20\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded n_atoms with shape (1110165,), dtype int32\n2024-02-29 12:17:13.983 | INFO     | openqdc.datasets.base:_precompute_statistics:154 - Loaded precomputed statistics\n2024-02-29 12:17:13.985 | INFO     | openqdc.datasets.base:_convert_data:141 - Converting spice data to the following units:\n                     Energy: kcal/mol,\n                     Distance: ang,\n                     Forces: kcal/mol/ang\n</pre> In\u00a0[39]: Copied! 
<pre>ds[0]\n</pre> ds[0] Out[39]: <pre>{'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n        [ 0.06135919,  2.6528177 , -0.4163168 ],\n        [ 1.762424  ,  1.0939031 , -1.4321265 ],\n        [-0.22598556,  1.6802124 ,  0.5978407 ],\n        [ 1.1740401 , -0.04154727, -0.512898  ],\n        [-0.41957757, -0.24454471,  3.0900123 ],\n        [ 0.7238282 ,  0.52511275,  0.8248042 ],\n        [ 0.05533566, -0.6713925 ,  1.6488242 ],\n        [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n        [-0.0657557 ,  1.8550861 , -2.3939755 ],\n        [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n        [-0.8098082 ,  3.201651  , -0.6507186 ],\n        [ 0.792407  ,  3.368585  ,  0.01799216],\n        [ 2.558414  ,  1.5826052 , -0.9704587 ],\n        [ 2.166226  ,  0.64460325, -2.384977  ],\n        [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n        [-1.1792994 ,  1.1978384 ,  0.34465855],\n        [ 1.8563557 , -0.90775317, -0.5115611 ],\n        [ 0.31435642, -0.42179283, -1.0628686 ],\n        [ 0.42152542,  0.25200853,  3.627957  ],\n        [-0.5416419 , -1.1152233 ,  3.7040234 ],\n        [-1.1868238 ,  0.46580845,  3.0541756 ],\n        [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n        [-0.7720179 , -0.9603249 ,  0.994841  ],\n        [ 1.7518724 , -1.5571898 ,  2.560223  ],\n        [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n        [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32),\n 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n        1, 1, 1, 1, 1], dtype=int32),\n 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0, 0, 0, 0], dtype=int32),\n 'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [ 
 -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 
19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}</pre> In\u00a0[40]: Copied! <pre>ds.get_ase_atoms(0)\n</pre> ds.get_ase_atoms(0) Out[40]: <pre>Atoms(symbols='C8NH18', pbc=False, initial_charges=...)</pre> In\u00a0[53]: Copied! 
<pre>ds.get_ase_atoms(0).info\n</pre> ds.get_ase_atoms(0).info Out[53]: <pre>{'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n 
        [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}</pre> In\u00a0[41]: Copied! 
<pre>for i in ds.as_iter():\n    print(i)\n    break\n</pre> for i in ds.as_iter():     print(i)     break <pre>{'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n       [ 0.06135919,  2.6528177 , -0.4163168 ],\n       [ 1.762424  ,  1.0939031 , -1.4321265 ],\n       [-0.22598556,  1.6802124 ,  0.5978407 ],\n       [ 1.1740401 , -0.04154727, -0.512898  ],\n       [-0.41957757, -0.24454471,  3.0900123 ],\n       [ 0.7238282 ,  0.52511275,  0.8248042 ],\n       [ 0.05533566, -0.6713925 ,  1.6488242 ],\n       [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n       [-0.0657557 ,  1.8550861 , -2.3939755 ],\n       [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n       [-0.8098082 ,  3.201651  , -0.6507186 ],\n       [ 0.792407  ,  3.368585  ,  0.01799216],\n       [ 2.558414  ,  1.5826052 , -0.9704587 ],\n       [ 2.166226  ,  0.64460325, -2.384977  ],\n       [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n       [-1.1792994 ,  1.1978384 ,  0.34465855],\n       [ 1.8563557 , -0.90775317, -0.5115611 ],\n       [ 0.31435642, -0.42179283, -1.0628686 ],\n       [ 0.42152542,  0.25200853,  3.627957  ],\n       [-0.5416419 , -1.1152233 ,  3.7040234 ],\n       [-1.1868238 ,  0.46580845,  3.0541756 ],\n       [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n       [-0.7720179 , -0.9603249 ,  0.994841  ],\n       [ 1.7518724 , -1.5571898 ,  2.560223  ],\n       [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n       [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32), 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1], dtype=int32), 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0], dtype=int32), 'e0': array([[-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-33939.41501837],\n       [  -312.9767089 ],\n       
[  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ]]), 'energies': array([-232450.64], dtype=float32), 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]', 'subset': 'PubChem', 'forces': array([[[  2.1335483 ],\n        [-37.241825  ],\n        [ 22.830988  ]],\n\n       [[ 68.235725  ],\n        [ 59.30573   ],\n        [-27.672606  ]],\n\n       [[-34.137283  ],\n        [-30.504696  ],\n        [-33.670048  ]],\n\n       [[-49.57814   ],\n        [-75.2747    ],\n        [ 32.80194   ]],\n\n       [[  8.196513  ],\n        [ 17.132149  ],\n        [-36.84995   ]],\n\n       [[ 67.39872   ],\n        [ -8.923976  ],\n        [-20.772083  ]],\n\n       [[ 45.424217  ],\n        [-33.559574  ],\n        [ 20.30243   ]],\n\n       [[-13.522426  ],\n        [ 79.690094  ],\n        [ 15.531546  ]],\n\n       [[ 35.77895   ],\n        [  1.9324436 ],\n        [ -8.205132  ]],\n\n       [[ -3.3487453 ],\n        [ -7.991125  ],\n        [ -9.71156   ]],\n\n       [[  1.4049193 ],\n        [ 13.497365  ],\n        [ -5.981079  ]],\n\n       [[-21.196207  ],\n        [ 16.861713  ],\n        [ -1.7730864 ]],\n\n       [[-10.805695  ],\n        [ -2.033095  ],\n        [ -4.2524548 ]],\n\n       [[ 35.204765  ],\n        [ 12.971134  ],\n        [ 22.815577  ]],\n\n       [[-11.87403   ],\n        [ 10.404548  ],\n        [ 23.009806  ]],\n\n       [[  2.3782759 ],\n        [ 19.309696  ],\n        [ 15.546526  ]],\n\n       [[ 
-2.5732849 ],\n        [ -4.098344  ],\n        [ -5.087256  ]],\n\n       [[  3.5987573 ],\n        [ 10.469024  ],\n        [  9.869113  ]],\n\n       [[ -8.646548  ],\n        [ -0.35554707],\n        [  1.7650104 ]],\n\n       [[ -6.6712875 ],\n        [ -0.7742697 ],\n        [-15.672442  ]],\n\n       [[-25.453985  ],\n        [ -9.350726  ],\n        [  6.0056353 ]],\n\n       [[-32.657543  ],\n        [ 10.617167  ],\n        [  2.516469  ]],\n\n       [[-23.541552  ],\n        [ -9.305013  ],\n        [ -9.855984  ]],\n\n       [[  2.8105662 ],\n        [-13.78966   ],\n        [ 10.141727  ]],\n\n       [[-29.951014  ],\n        [ -9.25683   ],\n        [-23.69946   ]],\n\n       [[ -3.412568  ],\n        [  4.13157   ],\n        [ 12.421117  ]],\n\n       [[  4.77353   ],\n        [-13.841051  ],\n        [  7.6428723 ]]], dtype=float32)}\n</pre> In\u00a0[42]: Copied! <pre>for i in ds.as_iter(atoms=True):\n    print(i)\n    break\n</pre> for i in ds.as_iter(atoms=True):     print(i)     break <pre>Atoms(symbols='C8NH18', pbc=False, initial_charges=...)\n</pre> In\u00a0[43]: Copied! 
<pre>from openqdc.methods import QmMethod\n\n# Get the b3lyp/6-31g* method\nmethod = QmMethod.B3LYP_6_31G_D\nmethod.atom_energies_dict\n</pre> from openqdc.methods import QmMethod  # Get the b3lyp/6-31g* method method = QmMethod.B3LYP_6_31G_D method.atom_energies_dict Out[43]: <pre>{('H', -1): -0.4618190740256503,\n ('H', 0): -0.5002733301377901,\n ('H', 1): 0.0,\n ('Li', 1): -7.284546111273075,\n ('B', -3): -23.577268753399462,\n ('B', -1): -24.614577395156598,\n ('B', 0): -24.65435524492553,\n ('B', 3): -22.018169862974275,\n ('C', -1): -37.844269871879376,\n ('C', 0): -37.84628033285479,\n ('C', 1): -37.42731164237431,\n ('N', -1): -54.52864356359092,\n ('N', 0): -54.584488815424095,\n ('N', 1): -54.0458621835885,\n ('O', -1): -75.05272792994404,\n ('O', 0): -75.06062109946738,\n ('O', 1): -74.54659271939704,\n ('F', -1): -99.75408410035712,\n ('F', 0): -99.71553471526475,\n ('Na', 1): -162.081235395777,\n ('Mg', 2): -199.22734695613283,\n ('Si', 4): -285.5564410277949,\n ('Si', 0): -289.3717359984153,\n ('Si', -4): -288.02795351148654,\n ('P', 0): -341.2580911838578,\n ('P', 1): -340.8765976669208,\n ('S', -1): -398.16568433994024,\n ('S', 0): -398.1049932797066,\n ('S', 1): -397.7199808615457,\n ('Cl', -2): -459.5066184980746,\n ('Cl', -1): -460.25223446009306,\n ('Cl', 0): -460.13624346967765,\n ('Cl', 2): -458.6740467177361,\n ('K', 1): -599.7247062673807,\n ('Ca', 2): -676.8667395990246,\n ('Br', -1): -2573.824201570383,\n ('Br', 0): -2573.705283744811,\n ('I', -1): None,\n ('I', 0): None}</pre> In\u00a0[44]: Copied! 
<pre># Get the matrix of atomization energies for the b3lyp/6-31g* method\nmethod.atom_energies_matrix\n</pre> # Get the matrix of atomization energies for the b3lyp/6-31g* method method.atom_energies_matrix Out[44]: <pre>array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])</pre> In\u00a0[45]: Copied! <pre>import matplotlib.pyplot as plt \nfrom sklearn.decomposition import PCA\ndatum = ds.soap_descriptors(n_samples=500, progress=True)\nreducer = PCA()\nembedding = reducer.fit_transform(datum[\"soap\"])\n</pre> import matplotlib.pyplot as plt  from sklearn.decomposition import PCA datum = ds.soap_descriptors(n_samples=500, progress=True) reducer = PCA() embedding = reducer.fit_transform(datum[\"soap\"])   <pre>100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u
2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 500/500 [00:01&lt;00:00, 459.21it/s]\n</pre> In\u00a0[46]: Copied! <pre>plt.scatter(\n    embedding[:, 0],\n    embedding[:, 1],\n    c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\nplt.colorbar()\n</pre> plt.scatter(     embedding[:, 0],     embedding[:, 1],     c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]]) plt.colorbar()  Out[46]: <pre>&lt;matplotlib.colorbar.Colorbar at 0x1554aa7bd820&gt;</pre>"},{"location":"tutorials/usage.html#openqdc-hands-on-tutorial","title":"OpenQDC Hands-on Tutorial\u00b6","text":""},{"location":"tutorials/usage.html#instantiate-and-go","title":"Instantiate and GO!\u00b6","text":"<p>If you don't have the dataset downloaded, it will be downloaded automatically and cached. You just instantiate the class and you are ready to go. 
Change of units is done automatically upon loading based on the units of the dataset.</p> <p>Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]</p> <p>Supported distance units: [\"ang\", \"nm\", \"bohr\"]</p>"},{"location":"tutorials/usage.html#items-from-the-dataset-object-class-are-obtained-through-the-get-method","title":"Items from the dataset object class are obtained through the \"get\" method.\u00b6","text":"<p>The dictionary of the item contains different important keys:</p> <ul> <li>'positions' : numpy array of the 3d atomic positions (n x 3)</li> <li>'atomic_numbers': numpy array of the atomic numbers (n)</li> <li>'charges': numpy array of the formal charges for the molecule (n)</li> <li>'e0': isolated atom energy of the atoms in the molecule (n x n_level_of_theories)</li> <li>'energies': potential energy of the molecule (n_level_of_theories)</li> <li>'name': name or SMILES (if present) of the molecule</li> <li>'subset': subset of the dataset the molecule belongs to</li> <li>'forces': if present, the forces on the atoms (n x 3 x n_level_of_theories_forces)</li> </ul>"},{"location":"tutorials/usage.html#alternatively-we-can-also-retrieve-the-data-from-the-dataset-object-class-as-aseatoms-using-the-get_ase_atoms","title":"Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!\u00b6","text":""},{"location":"tutorials/usage.html#iterators","title":"Iterators\u00b6","text":"<p>The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns the data as ase.Atoms objects. Otherwise, it returns the dictionary of the item.</p>"},{"location":"tutorials/usage.html#isolated-atoms-energies-e0s","title":"Isolated atoms energies [e0s]\u00b6","text":"<p>The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.</p> <p>$U(A_1, A_2, ...) 
= \\sum_{i=1}^N e_0(A_i) + e(A_1, A_2, ...)$</p> <p>The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:</p>"},{"location":"tutorials/usage.html#chemical-space-from-soap-descriptors","title":"Chemical space from SOAP descriptors\u00b6","text":"<p>openQDC offers a simple way to calculate the Smooth Overlap of Atomic Positions (SOAP) descriptors for the molecules in the dataset. The method soap_descriptors returns the SOAP descriptors for the molecules in the dataset.</p>"}]}
\ No newline at end of file
diff --git a/main/sitemap.xml b/main/sitemap.xml
index 9076776..d211d1b 100644
--- a/main/sitemap.xml
+++ b/main/sitemap.xml
@@ -90,6 +90,11 @@
          <lastmod>2024-08-30</lastmod>
          <changefreq>daily</changefreq>
     </url>
+    <url>
+         <loc>https://github.com/valence-labs/openQDC/main/API/datasets/3bpa.html</loc>
+         <lastmod>2024-08-30</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
     <url>
          <loc>https://github.com/valence-labs/openQDC/main/API/datasets/alchemy.html</loc>
          <lastmod>2024-08-30</lastmod>
@@ -130,6 +135,11 @@
          <lastmod>2024-08-30</lastmod>
          <changefreq>daily</changefreq>
     </url>
+    <url>
+         <loc>https://github.com/valence-labs/openQDC/main/API/datasets/maceoff.html</loc>
+         <lastmod>2024-08-30</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
     <url>
          <loc>https://github.com/valence-labs/openQDC/main/API/datasets/md22.html</loc>
          <lastmod>2024-08-30</lastmod>
diff --git a/main/sitemap.xml.gz b/main/sitemap.xml.gz
index 3f81275..c1330a6 100644
Binary files a/main/sitemap.xml.gz and b/main/sitemap.xml.gz differ
diff --git a/main/tutorials/usage.html b/main/tutorials/usage.html
index b1ede83..4bc3c46 100644
--- a/main/tutorials/usage.html
+++ b/main/tutorials/usage.html
@@ -1049,6 +1049,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="../API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="../API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1112,6 +1133,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="../API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="../API/datasets/geom.html" class="md-nav__link">
         
diff --git a/main/usage.html b/main/usage.html
index ed38163..2b17c27 100644
--- a/main/usage.html
+++ b/main/usage.html
@@ -1016,6 +1016,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/3bpa.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    3BPA
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/alchemy.html" class="md-nav__link">
         
@@ -1079,6 +1100,27 @@
   
   
   
+    <li class="md-nav__item">
+      <a href="API/datasets/maceoff.html" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    MaceOFF
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
     <li class="md-nav__item">
       <a href="API/datasets/geom.html" class="md-nav__link">