From 76bf7e9f1388fdf54b1ff39f8412c47dd839fffa Mon Sep 17 00:00:00 2001 From: Pranab Das <31024886+pranabdas@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:44:39 +0800 Subject: [PATCH 1/6] SOF-7123: bump github checkout/actions to v4 --- .github/workflows/cicd.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 66cfa19c..d767d926 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -12,12 +12,12 @@ jobs: steps: - name: Checkout this repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: lfs: true - name: Checkout actions repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: repository: Exabyte-io/actions token: ${{ secrets.BOT_GITHUB_TOKEN }} @@ -38,12 +38,12 @@ jobs: # # steps: # - name: Checkout this repository -# uses: actions/checkout@v2 +# uses: actions/checkout@v4 # with: # lfs: true # # - name: Checkout actions repository -# uses: actions/checkout@v2 +# uses: actions/checkout@v4 # with: # repository: Exabyte-io/actions # token: ${{ secrets.BOT_GITHUB_TOKEN }} @@ -66,12 +66,12 @@ jobs: steps: - name: Checkout this repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: lfs: true - name: Checkout actions repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: repository: Exabyte-io/actions token: ${{ secrets.BOT_GITHUB_TOKEN }} @@ -91,12 +91,12 @@ jobs: steps: - name: Checkout this repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: lfs: true - name: Checkout actions repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: repository: Exabyte-io/actions token: ${{ secrets.BOT_GITHUB_TOKEN }} From 5d23b02c658938fd5f0763c77120d66a18a68266 Mon Sep 17 00:00:00 2001 From: Pranab Das <31024886+pranabdas@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:56:17 +0800 Subject: [PATCH 2/6] SOF-7123: new pw_scf_dft_v template --- 
assets/espresso/pw_scf_dft_v.j2.in | 47 ++++++++++++++++++++++++++++++ executables/espresso/pw.x.yml | 20 +++++++++++++ src/js/data/templates.js | 2 +- src/js/data/tree.js | 2 +- templates/espresso/pw.x.yml | 10 +++++++ 5 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 assets/espresso/pw_scf_dft_v.j2.in diff --git a/assets/espresso/pw_scf_dft_v.j2.in b/assets/espresso/pw_scf_dft_v.j2.in new file mode 100644 index 00000000..557a4f9e --- /dev/null +++ b/assets/espresso/pw_scf_dft_v.j2.in @@ -0,0 +1,47 @@ +{% if subworkflowContext.MATERIAL_INDEX %} +{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%} +{% endif -%} +&CONTROL + calculation = 'scf' + title = '' + verbosity = 'low' + restart_mode = '{{ input.RESTART_MODE }}' + wf_collect = .true. + tstress = .true. + tprnfor = .true. + outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %} + wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %} + prefix = '__prefix__' + pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %} +/ +&SYSTEM + ibrav = {{ input.IBRAV }} + nat = {{ input.NAT }} + ntyp = {{ input.NTYP }} + ecutwfc = {{ cutoffs.wavefunction }} + ecutrho = {{ cutoffs.density }} + occupations = 'fixed' +/ +&ELECTRONS + diagonalization = 'david' + diago_david_ndim = 4 + diago_full_acc = .true. 
+ mixing_beta = 0.3 + startingwfc = 'atomic+random' +/ +&IONS +/ +&CELL +/ +ATOMIC_SPECIES +{{ input.ATOMIC_SPECIES }} +ATOMIC_POSITIONS crystal +{{ input.ATOMIC_POSITIONS }} +CELL_PARAMETERS angstrom +{{ input.CELL_PARAMETERS }} +K_POINTS automatic +{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %} +HUBBARD {ortho-atomic} +{% for row in hubbard_v -%} +V {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.atomicSpecies2 }}-{{ row.atomicOrbital2 }} {{ row.siteIndex }} {{ row.siteIndex2 }} {{ row.hubbardVValue }} +{% endfor -%} diff --git a/executables/espresso/pw.x.yml b/executables/espresso/pw.x.yml index 9e1148ad..82de8ac2 100644 --- a/executables/espresso/pw.x.yml +++ b/executables/espresso/pw.x.yml @@ -96,6 +96,26 @@ flavors: supportedApplicationVersions: - '7.2' + pw_scf_dft_v: + input: + - name: pw_scf_dft_v.in + results: + - atomic_forces + - band_gaps + - fermi_energy + - pressure + - stress_tensor + - total_energy + - total_energy_contributions + - total_force + monitors: + - standard_output + - convergence_electronic + applicationName: espresso + executableName: pw.x + supportedApplicationVersions: + - '7.2' + pw_scf_dft_u_legacy: input: - name: pw_scf_dft_u_legacy.in diff --git a/src/js/data/templates.js b/src/js/data/templates.js index 723a750c..56684885 100644 --- a/src/js/data/templates.js +++ b/src/js/data/templates.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {allTemplates: [{"content":"1\npp.dat\n1.0\n3000\n3\n3.0000\n","name":"average.in","contextProviders":[],"applicationName":"espresso","executableName":"average.x"},{"content":"&BANDS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n filband = {% raw %}'{{ JOB_WORK_DIR }}/bands.dat'{% endraw %}\n no_overlap = .true.\n/\n\n","name":"bands.in","contextProviders":[],"applicationName":"espresso","executableName":"bands.x"},{"content":"&DOS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR 
}}/outdir'{% endraw %}\n degauss = 0.01\n/\n\n","name":"dos.in","contextProviders":[],"applicationName":"espresso","executableName":"dos.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"dynmat_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"dynmat.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! truncation (used for both correlation and exchange)\n truncation = '2d'\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of the Coulomb solver\n thres_coul = 1.0d-2\n\n ! configuration of W in the convolution\n model_coul = 'godby-needs'\n max_freq_coul = 120\n num_freq_coul = 35\n\n ! configuration of the Green solver\n thres_green = 1.0d-3\n max_iter_green = 300\n\n ! configuration for the correlation self energy\n ecut_corr = 5.0\n max_freq_corr = 100.0\n num_freq_corr = 11\n\n ! configuration for the exchange self energy\n ecut_exch = 20.0\n\n ! 
configuration for the output\n eta = 0.1\n min_freq_wind = -30.0\n max_freq_wind = 30.0\n num_freq_wind = 601\n/\n\n&gw_output\n/\n\nFREQUENCIES\n2\n 0.0 0.0\n 0.0 10.0\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_plasmon_pole.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of W in the convolution\n max_freq_coul = 200\n num_freq_coul = 51\n\n ! configuration for the correlation self energy\n ecut_corr = 6.0\n\n ! 
configuration for the exchange self energy\n ecut_exch = 15.0\n/\n\n&gw_output\n/\n\nFREQUENCIES\n35\n 0.0 0.0\n 0.0 0.3\n 0.0 0.9\n 0.0 1.8\n 0.0 3.0\n 0.0 4.5\n 0.0 6.3\n 0.0 8.4\n 0.0 10.8\n 0.0 13.5\n 0.0 16.5\n 0.0 19.8\n 0.0 23.4\n 0.0 27.3\n 0.0 31.5\n 0.0 36.0\n 0.0 40.8\n 0.0 45.9\n 0.0 51.3\n 0.0 57.0\n 0.0 63.0\n 0.0 69.3\n 0.0 75.9\n 0.0 82.8\n 0.0 90.0\n 0.0 97.5\n 0.0 105.3\n 0.0 113.4\n 0.0 121.8\n 0.0 130.5\n 0.0 139.5\n 0.0 148.8\n 0.0 158.4\n 0.0 168.3\n 0.0 178.5\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_full_frequency.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"matdyn_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc ='force_constants.fc'\n flfrq ='frequencies.freq'\n flvec ='normal_modes.out'\n q_in_band_form = .true.\n /\n{{ipath.length}}\n{% for point in ipath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"matdyn_path.in","contextProviders":[{"name":"IPathFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"BEGIN\nBEGIN_PATH_INPUT\n&PATH\n restart_mode = 'from_scratch'\n string_method = 'neb',\n nstep_path = 50,\n ds = 2.D0,\n opt_scheme = \"broyden\",\n num_of_images = {{ 2 + (input.INTERMEDIATE_IMAGES.length || neb.nImages) }},\n k_max = 0.3D0,\n k_min = 0.2D0,\n CI_scheme = \"auto\",\n path_thr = 
0.1D0,\n/\nEND_PATH_INPUT\nBEGIN_ENGINE_INPUT\n&CONTROL\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.03\n nspin = 2\n starting_magnetization = 0.5\n/\n&ELECTRONS\n conv_thr = 1.D-8\n mixing_beta = 0.3\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nBEGIN_POSITIONS\nFIRST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.FIRST_IMAGE }}\n{%- for IMAGE in input.INTERMEDIATE_IMAGES %}\nINTERMEDIATE_IMAGE\nATOMIC_POSITIONS crystal\n{{ IMAGE }}\n{%- endfor %}\nLAST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.LAST_IMAGE }}\nEND_POSITIONS\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nEND_ENGINE_INPUT\nEND\n","name":"neb.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"NEBFormDataManager"},{"name":"QENEBInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"neb.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n ldisp = .true.\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n{% for point in qpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor %}\n{% endfor 
%}\n","name":"ph_path.in","contextProviders":[{"name":"QPathFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n0 0 0\n","name":"ph_gamma.in","contextProviders":[],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .false.\n start_irr = 0\n last_irr = 0\n ldisp = .true.\n fildyn = 'dyn0'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_init_qpoints.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .true.\n ldisp = .true.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid_restart.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18\n ldisp = .true.\n {% raw -%}\n start_q = {{MAP_DATA.qpoint}}\n last_q = {{MAP_DATA.qpoint}}\n start_irr = {{MAP_DATA.irr}}\n last_irr= {{MAP_DATA.irr}}\n {%- endraw %}\n recover = .true.\n fildyn = 'dyn'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_SCRATCH_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_single_irr_qpt.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 0\n/\n&PLOT\n iflag = 3\n output_format = 5\n fileout 
='density.xsf'\n/\n\n","name":"pp_density.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 11\n/\n","name":"pp_electrostatic_potential.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&PROJWFC\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n deltaE = 0.05\n/\n","name":"projwfc.in","contextProviders":[],"applicationName":"espresso","executableName":"projwfc.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor 
%}\n","name":"pw_scf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n input_dft = 'hse',\n {% for d in qgrid.dimensions -%}\n nqx{{loop.index}} = {{d}}\n {% endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal\n{{ '{{' }} {{ explicitKPath.length }} {% raw %} + KPOINTS|length }} {% endraw %}\n{%- raw %}\n{% for point in KPOINTS -%}\n {% for d in point.coordinates %}{{ \"%14.9f\"|format(d) }} {% endfor -%}{{ point.weight }}\n{% endfor %}\n{% endraw -%}\n{% for point in explicitKPath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}0.0000001\n{% endfor %}\n","name":"pw_scf_bands_hse.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPathFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n 
outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ 
boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% raw %}{{PARAMETER | default('1')}} {{PARAMETER | default('1')}} {{PARAMETER | default('1')}} 0 0 0{% endraw 
%}\n","name":"pw_scf_kpt_conv.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'nscf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n{%- if subworkflowContext.NO_SYMMETRY_NO_INVERSION %}\n nosym = .true.\n noinv = .true.\n{%- endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_nscf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n nstep = 50\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ 
cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'vc-relax'\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_vc_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if 
subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'bands'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal_b\n{{kpath.length}}\n{% for point in kpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"pw_bands.in","contextProviders":[{"name":"KPathFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n 
ecutrho = {{ cutoffs.density }}\n ecutfock = 100\n occupations = 'smearing'\n degauss = 0.005\n input_dft='hse',\n nqx1 = {% if kgrid.dimensions[0]%2 == 0 %}{{kgrid.dimensions[0]/2}}{% else %}{{(kgrid.dimensions[0]+1)/2}}{% endif %}, nqx2 = {% if kgrid.dimensions[1]%2 == 0 %}{{kgrid.dimensions[1]/2}}{% else %}{{(kgrid.dimensions[1]+1)/2}}{% endif %}, nqx3 = {% if kgrid.dimensions[2]%2 == 0 %}{{kgrid.dimensions[2]/2}}{% else %}{{(kgrid.dimensions[2]+1)/2}}{% endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{% if d%2 == 0 %}{{d}} {% else %}{{d+1}} {% endif %}{% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_hse.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 
'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard -%}\nU {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }}\n{% endfor -%}\n","name":"pw_scf_dft_u.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n lda_plus_u = .true.\n lda_plus_u_kind = 0\n U_projection_type = 'ortho-atomic'\n {%- for row in hubbard_legacy %}\n Hubbard_U({{ row.atomicSpeciesIndex }}) = {{ row.hubbardUValue }}\n {%- endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s 
in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_dft_u_legacy.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManagerLegacy"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&INPUT\n fildyn = 'dyn'\n zasr = 'simple'\n flfrc = 'force_constants.fc'\n/\n","name":"q2r.in","contextProviders":[],"applicationName":"espresso","executableName":"q2r.x"},{"content":"&inputhp\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {%- for d in qgrid.dimensions %}\n nq{{ loop.index }} = {{ d }}\n {%- endfor %}\n/\n","name":"hp.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"hp.x"},{"content":"&inputpp\n calculation = \"eps\"\n prefix = \"__prefix__\"\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n\n&energy_grid\n smeartype = \"gauss\"\n intersmear = 0.2\n intrasmear = 0.0\n wmin = 0.0\n wmax = 30.0\n nw = 500\n shift = 0.0\n/\n\n","name":"epsilon.in","contextProviders":[],"applicationName":"espresso","executableName":"epsilon.x"},{"content":"# ------------------------------------------------------------------------------- #\n# #\n# Example JupyterLab requirements #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. A Python virtual environment is created #\n# - in /scratch/$USER/$JOB_ID (for build: 'Default') #\n# - in /export/share/python/ (for build: 'with-pre-installed-packages') #\n# 3. This list is used to populate a Python virtual environment #\n# 4. 
JupyterLab is started #\n# #\n# For more information visit: #\n# - https://jupyterlab.readthedocs.io/en/stable/index.html #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Note: With the JupyterLab build 'with-pre-installed-packages', packages #\n# cannot be added during the notebook runtime. #\n# #\n# ------------------------------------------------------------------------------- #\n\njupyterlab==3.0.3\nnotebook>=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. 
A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR <=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. 
#\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. 
A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR < Date: Fri, 1 Dec 2023 17:01:17 +0800 Subject: [PATCH 3/6] SOF-7123: rename HubbardContextProvider -> HubbardUContextProvider + also rename context name hubbard -> hubbard_u --- assets/espresso/pw_scf_dft_u.j2.in | 2 +- src/js/data/templates.js | 2 +- templates/espresso/pw.x.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/espresso/pw_scf_dft_u.j2.in b/assets/espresso/pw_scf_dft_u.j2.in index 6c01cbfa..4d729e3a 100644 --- a/assets/espresso/pw_scf_dft_u.j2.in +++ b/assets/espresso/pw_scf_dft_u.j2.in @@ -42,6 +42,6 @@ CELL_PARAMETERS angstrom K_POINTS automatic {% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %} HUBBARD {ortho-atomic} -{% for row in hubbard -%} +{% for row in hubbard_u -%} U {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }} {% endfor -%} diff --git a/src/js/data/templates.js b/src/js/data/templates.js index 56684885..a156f8e9 100644 --- a/src/js/data/templates.js +++ b/src/js/data/templates.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {allTemplates: [{"content":"1\npp.dat\n1.0\n3000\n3\n3.0000\n","name":"average.in","contextProviders":[],"applicationName":"espresso","executableName":"average.x"},{"content":"&BANDS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n filband = {% raw %}'{{ 
JOB_WORK_DIR }}/bands.dat'{% endraw %}\n no_overlap = .true.\n/\n\n","name":"bands.in","contextProviders":[],"applicationName":"espresso","executableName":"bands.x"},{"content":"&DOS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n/\n\n","name":"dos.in","contextProviders":[],"applicationName":"espresso","executableName":"dos.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"dynmat_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"dynmat.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! truncation (used for both correlation and exchange)\n truncation = '2d'\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of the Coulomb solver\n thres_coul = 1.0d-2\n\n ! configuration of W in the convolution\n model_coul = 'godby-needs'\n max_freq_coul = 120\n num_freq_coul = 35\n\n ! configuration of the Green solver\n thres_green = 1.0d-3\n max_iter_green = 300\n\n ! configuration for the correlation self energy\n ecut_corr = 5.0\n max_freq_corr = 100.0\n num_freq_corr = 11\n\n ! configuration for the exchange self energy\n ecut_exch = 20.0\n\n ! 
configuration for the output\n eta = 0.1\n min_freq_wind = -30.0\n max_freq_wind = 30.0\n num_freq_wind = 601\n/\n\n&gw_output\n/\n\nFREQUENCIES\n2\n 0.0 0.0\n 0.0 10.0\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_plasmon_pole.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of W in the convolution\n max_freq_coul = 200\n num_freq_coul = 51\n\n ! configuration for the correlation self energy\n ecut_corr = 6.0\n\n ! 
configuration for the exchange self energy\n ecut_exch = 15.0\n/\n\n&gw_output\n/\n\nFREQUENCIES\n35\n 0.0 0.0\n 0.0 0.3\n 0.0 0.9\n 0.0 1.8\n 0.0 3.0\n 0.0 4.5\n 0.0 6.3\n 0.0 8.4\n 0.0 10.8\n 0.0 13.5\n 0.0 16.5\n 0.0 19.8\n 0.0 23.4\n 0.0 27.3\n 0.0 31.5\n 0.0 36.0\n 0.0 40.8\n 0.0 45.9\n 0.0 51.3\n 0.0 57.0\n 0.0 63.0\n 0.0 69.3\n 0.0 75.9\n 0.0 82.8\n 0.0 90.0\n 0.0 97.5\n 0.0 105.3\n 0.0 113.4\n 0.0 121.8\n 0.0 130.5\n 0.0 139.5\n 0.0 148.8\n 0.0 158.4\n 0.0 168.3\n 0.0 178.5\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_full_frequency.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"matdyn_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc ='force_constants.fc'\n flfrq ='frequencies.freq'\n flvec ='normal_modes.out'\n q_in_band_form = .true.\n /\n{{ipath.length}}\n{% for point in ipath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"matdyn_path.in","contextProviders":[{"name":"IPathFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"BEGIN\nBEGIN_PATH_INPUT\n&PATH\n restart_mode = 'from_scratch'\n string_method = 'neb',\n nstep_path = 50,\n ds = 2.D0,\n opt_scheme = \"broyden\",\n num_of_images = {{ 2 + (input.INTERMEDIATE_IMAGES.length || neb.nImages) }},\n k_max = 0.3D0,\n k_min = 0.2D0,\n CI_scheme = \"auto\",\n path_thr = 
0.1D0,\n/\nEND_PATH_INPUT\nBEGIN_ENGINE_INPUT\n&CONTROL\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.03\n nspin = 2\n starting_magnetization = 0.5\n/\n&ELECTRONS\n conv_thr = 1.D-8\n mixing_beta = 0.3\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nBEGIN_POSITIONS\nFIRST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.FIRST_IMAGE }}\n{%- for IMAGE in input.INTERMEDIATE_IMAGES %}\nINTERMEDIATE_IMAGE\nATOMIC_POSITIONS crystal\n{{ IMAGE }}\n{%- endfor %}\nLAST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.LAST_IMAGE }}\nEND_POSITIONS\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nEND_ENGINE_INPUT\nEND\n","name":"neb.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"NEBFormDataManager"},{"name":"QENEBInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"neb.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n ldisp = .true.\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n{% for point in qpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor %}\n{% endfor 
%}\n","name":"ph_path.in","contextProviders":[{"name":"QPathFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n0 0 0\n","name":"ph_gamma.in","contextProviders":[],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .false.\n start_irr = 0\n last_irr = 0\n ldisp = .true.\n fildyn = 'dyn0'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_init_qpoints.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .true.\n ldisp = .true.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid_restart.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18\n ldisp = .true.\n {% raw -%}\n start_q = {{MAP_DATA.qpoint}}\n last_q = {{MAP_DATA.qpoint}}\n start_irr = {{MAP_DATA.irr}}\n last_irr= {{MAP_DATA.irr}}\n {%- endraw %}\n recover = .true.\n fildyn = 'dyn'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_SCRATCH_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_single_irr_qpt.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 0\n/\n&PLOT\n iflag = 3\n output_format = 5\n fileout 
='density.xsf'\n/\n\n","name":"pp_density.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 11\n/\n","name":"pp_electrostatic_potential.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&PROJWFC\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n deltaE = 0.05\n/\n","name":"projwfc.in","contextProviders":[],"applicationName":"espresso","executableName":"projwfc.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor 
%}\n","name":"pw_scf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n input_dft = 'hse',\n {% for d in qgrid.dimensions -%}\n nqx{{loop.index}} = {{d}}\n {% endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal\n{{ '{{' }} {{ explicitKPath.length }} {% raw %} + KPOINTS|length }} {% endraw %}\n{%- raw %}\n{% for point in KPOINTS -%}\n {% for d in point.coordinates %}{{ \"%14.9f\"|format(d) }} {% endfor -%}{{ point.weight }}\n{% endfor %}\n{% endraw -%}\n{% for point in explicitKPath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}0.0000001\n{% endfor %}\n","name":"pw_scf_bands_hse.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPathFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n 
outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ 
boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% raw %}{{PARAMETER | default('1')}} {{PARAMETER | default('1')}} {{PARAMETER | default('1')}} 0 0 0{% endraw 
%}\n","name":"pw_scf_kpt_conv.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'nscf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n{%- if subworkflowContext.NO_SYMMETRY_NO_INVERSION %}\n nosym = .true.\n noinv = .true.\n{%- endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_nscf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n nstep = 50\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ 
cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'vc-relax'\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_vc_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if 
subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'bands'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal_b\n{{kpath.length}}\n{% for point in kpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"pw_bands.in","contextProviders":[{"name":"KPathFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n 
ecutrho = {{ cutoffs.density }}\n ecutfock = 100\n occupations = 'smearing'\n degauss = 0.005\n input_dft='hse',\n nqx1 = {% if kgrid.dimensions[0]%2 == 0 %}{{kgrid.dimensions[0]/2}}{% else %}{{(kgrid.dimensions[0]+1)/2}}{% endif %}, nqx2 = {% if kgrid.dimensions[1]%2 == 0 %}{{kgrid.dimensions[1]/2}}{% else %}{{(kgrid.dimensions[1]+1)/2}}{% endif %}, nqx3 = {% if kgrid.dimensions[2]%2 == 0 %}{{kgrid.dimensions[2]/2}}{% else %}{{(kgrid.dimensions[2]+1)/2}}{% endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{% if d%2 == 0 %}{{d}} {% else %}{{d+1}} {% endif %}{% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_hse.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 
'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard -%}\nU {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }}\n{% endfor -%}\n","name":"pw_scf_dft_u.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_v -%}\nV {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.atomicSpecies2 }}-{{ row.atomicOrbital2 }} {{ 
row.siteIndex }} {{ row.siteIndex2 }} {{ row.hubbardVValue }}\n{% endfor -%}\n","name":"pw_scf_dft_v.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardVContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n lda_plus_u = .true.\n lda_plus_u_kind = 0\n U_projection_type = 'ortho-atomic'\n {%- for row in hubbard_legacy %}\n Hubbard_U({{ row.atomicSpeciesIndex }}) = {{ row.hubbardUValue }}\n {%- endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_dft_u_legacy.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManagerLegacy"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&INPUT\n fildyn = 'dyn'\n zasr = 'simple'\n flfrc = 
'force_constants.fc'\n/\n","name":"q2r.in","contextProviders":[],"applicationName":"espresso","executableName":"q2r.x"},{"content":"&inputhp\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {%- for d in qgrid.dimensions %}\n nq{{ loop.index }} = {{ d }}\n {%- endfor %}\n/\n","name":"hp.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"hp.x"},{"content":"&inputpp\n calculation = \"eps\"\n prefix = \"__prefix__\"\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n\n&energy_grid\n smeartype = \"gauss\"\n intersmear = 0.2\n intrasmear = 0.0\n wmin = 0.0\n wmax = 30.0\n nw = 500\n shift = 0.0\n/\n\n","name":"epsilon.in","contextProviders":[],"applicationName":"espresso","executableName":"epsilon.x"},{"content":"# ------------------------------------------------------------------------------- #\n# #\n# Example JupyterLab requirements #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. A Python virtual environment is created #\n# - in /scratch/$USER/$JOB_ID (for build: 'Default') #\n# - in /export/share/python/ (for build: 'with-pre-installed-packages') #\n# 3. This list is used to populate a Python virtual environment #\n# 4. JupyterLab is started #\n# #\n# For more information visit: #\n# - https://jupyterlab.readthedocs.io/en/stable/index.html #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Note: With the JupyterLab build 'with-pre-installed-packages', packages #\n# cannot be added during the notebook runtime. 
#\n# #\n# ------------------------------------------------------------------------------- #\n\njupyterlab==3.0.3\nnotebook>=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR <=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. 
#\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. 
A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR < Date: Sat, 2 Dec 2023 09:40:50 +0800 Subject: [PATCH 4/6] SOF-7123: new template for DFT+U+J calculation --- assets/espresso/pw_scf_dft_j.j2.in | 47 ++++++++++++++++++++++++++++++ executables/espresso/pw.x.yml | 20 +++++++++++++ src/js/data/templates.js | 2 +- src/js/data/tree.js | 2 +- templates/espresso/pw.x.yml | 10 +++++++ 5 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 assets/espresso/pw_scf_dft_j.j2.in diff --git a/assets/espresso/pw_scf_dft_j.j2.in b/assets/espresso/pw_scf_dft_j.j2.in new file mode 100644 index 00000000..388e38c1 --- /dev/null +++ b/assets/espresso/pw_scf_dft_j.j2.in @@ -0,0 +1,47 @@ +{% if subworkflowContext.MATERIAL_INDEX %} +{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%} +{% endif -%} +&CONTROL + calculation = 'scf' + title = '' + verbosity = 'low' + restart_mode = '{{ input.RESTART_MODE }}' + wf_collect = .true. + tstress = .true. + tprnfor = .true. 
+ outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %} + wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %} + prefix = '__prefix__' + pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %} +/ +&SYSTEM + ibrav = {{ input.IBRAV }} + nat = {{ input.NAT }} + ntyp = {{ input.NTYP }} + ecutwfc = {{ cutoffs.wavefunction }} + ecutrho = {{ cutoffs.density }} + occupations = 'fixed' +/ +&ELECTRONS + diagonalization = 'david' + diago_david_ndim = 4 + diago_full_acc = .true. + mixing_beta = 0.3 + startingwfc = 'atomic+random' +/ +&IONS +/ +&CELL +/ +ATOMIC_SPECIES +{{ input.ATOMIC_SPECIES }} +ATOMIC_POSITIONS crystal +{{ input.ATOMIC_POSITIONS }} +CELL_PARAMETERS angstrom +{{ input.CELL_PARAMETERS }} +K_POINTS automatic +{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %} +HUBBARD {ortho-atomic} +{% for row in hubbard_j -%} +{{ row.paramType }} {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.value }} +{% endfor -%} diff --git a/executables/espresso/pw.x.yml b/executables/espresso/pw.x.yml index 82de8ac2..4f3b37fb 100644 --- a/executables/espresso/pw.x.yml +++ b/executables/espresso/pw.x.yml @@ -116,6 +116,26 @@ flavors: supportedApplicationVersions: - '7.2' + pw_scf_dft_j: + input: + - name: pw_scf_dft_j.in + results: + - atomic_forces + - band_gaps + - fermi_energy + - pressure + - stress_tensor + - total_energy + - total_energy_contributions + - total_force + monitors: + - standard_output + - convergence_electronic + applicationName: espresso + executableName: pw.x + supportedApplicationVersions: + - '7.2' + pw_scf_dft_u_legacy: input: - name: pw_scf_dft_u_legacy.in diff --git a/src/js/data/templates.js b/src/js/data/templates.js index a156f8e9..8be783ed 100644 --- a/src/js/data/templates.js +++ b/src/js/data/templates.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {allTemplates: 
[{"content":"1\npp.dat\n1.0\n3000\n3\n3.0000\n","name":"average.in","contextProviders":[],"applicationName":"espresso","executableName":"average.x"},{"content":"&BANDS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n filband = {% raw %}'{{ JOB_WORK_DIR }}/bands.dat'{% endraw %}\n no_overlap = .true.\n/\n\n","name":"bands.in","contextProviders":[],"applicationName":"espresso","executableName":"bands.x"},{"content":"&DOS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n/\n\n","name":"dos.in","contextProviders":[],"applicationName":"espresso","executableName":"dos.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"dynmat_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"dynmat.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! truncation (used for both correlation and exchange)\n truncation = '2d'\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of the Coulomb solver\n thres_coul = 1.0d-2\n\n ! configuration of W in the convolution\n model_coul = 'godby-needs'\n max_freq_coul = 120\n num_freq_coul = 35\n\n ! configuration of the Green solver\n thres_green = 1.0d-3\n max_iter_green = 300\n\n ! configuration for the correlation self energy\n ecut_corr = 5.0\n max_freq_corr = 100.0\n num_freq_corr = 11\n\n ! configuration for the exchange self energy\n ecut_exch = 20.0\n\n ! 
configuration for the output\n eta = 0.1\n min_freq_wind = -30.0\n max_freq_wind = 30.0\n num_freq_wind = 601\n/\n\n&gw_output\n/\n\nFREQUENCIES\n2\n 0.0 0.0\n 0.0 10.0\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_plasmon_pole.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of W in the convolution\n max_freq_coul = 200\n num_freq_coul = 51\n\n ! configuration for the correlation self energy\n ecut_corr = 6.0\n\n ! 
configuration for the exchange self energy\n ecut_exch = 15.0\n/\n\n&gw_output\n/\n\nFREQUENCIES\n35\n 0.0 0.0\n 0.0 0.3\n 0.0 0.9\n 0.0 1.8\n 0.0 3.0\n 0.0 4.5\n 0.0 6.3\n 0.0 8.4\n 0.0 10.8\n 0.0 13.5\n 0.0 16.5\n 0.0 19.8\n 0.0 23.4\n 0.0 27.3\n 0.0 31.5\n 0.0 36.0\n 0.0 40.8\n 0.0 45.9\n 0.0 51.3\n 0.0 57.0\n 0.0 63.0\n 0.0 69.3\n 0.0 75.9\n 0.0 82.8\n 0.0 90.0\n 0.0 97.5\n 0.0 105.3\n 0.0 113.4\n 0.0 121.8\n 0.0 130.5\n 0.0 139.5\n 0.0 148.8\n 0.0 158.4\n 0.0 168.3\n 0.0 178.5\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_full_frequency.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"matdyn_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc ='force_constants.fc'\n flfrq ='frequencies.freq'\n flvec ='normal_modes.out'\n q_in_band_form = .true.\n /\n{{ipath.length}}\n{% for point in ipath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"matdyn_path.in","contextProviders":[{"name":"IPathFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"BEGIN\nBEGIN_PATH_INPUT\n&PATH\n restart_mode = 'from_scratch'\n string_method = 'neb',\n nstep_path = 50,\n ds = 2.D0,\n opt_scheme = \"broyden\",\n num_of_images = {{ 2 + (input.INTERMEDIATE_IMAGES.length || neb.nImages) }},\n k_max = 0.3D0,\n k_min = 0.2D0,\n CI_scheme = \"auto\",\n path_thr = 
0.1D0,\n/\nEND_PATH_INPUT\nBEGIN_ENGINE_INPUT\n&CONTROL\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.03\n nspin = 2\n starting_magnetization = 0.5\n/\n&ELECTRONS\n conv_thr = 1.D-8\n mixing_beta = 0.3\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nBEGIN_POSITIONS\nFIRST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.FIRST_IMAGE }}\n{%- for IMAGE in input.INTERMEDIATE_IMAGES %}\nINTERMEDIATE_IMAGE\nATOMIC_POSITIONS crystal\n{{ IMAGE }}\n{%- endfor %}\nLAST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.LAST_IMAGE }}\nEND_POSITIONS\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nEND_ENGINE_INPUT\nEND\n","name":"neb.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"NEBFormDataManager"},{"name":"QENEBInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"neb.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n ldisp = .true.\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n{% for point in qpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor %}\n{% endfor 
%}\n","name":"ph_path.in","contextProviders":[{"name":"QPathFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n0 0 0\n","name":"ph_gamma.in","contextProviders":[],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .false.\n start_irr = 0\n last_irr = 0\n ldisp = .true.\n fildyn = 'dyn0'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_init_qpoints.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .true.\n ldisp = .true.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid_restart.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18\n ldisp = .true.\n {% raw -%}\n start_q = {{MAP_DATA.qpoint}}\n last_q = {{MAP_DATA.qpoint}}\n start_irr = {{MAP_DATA.irr}}\n last_irr= {{MAP_DATA.irr}}\n {%- endraw %}\n recover = .true.\n fildyn = 'dyn'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_SCRATCH_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_single_irr_qpt.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 0\n/\n&PLOT\n iflag = 3\n output_format = 5\n fileout 
='density.xsf'\n/\n\n","name":"pp_density.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 11\n/\n","name":"pp_electrostatic_potential.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&PROJWFC\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n deltaE = 0.05\n/\n","name":"projwfc.in","contextProviders":[],"applicationName":"espresso","executableName":"projwfc.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor 
%}\n","name":"pw_scf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n input_dft = 'hse',\n {% for d in qgrid.dimensions -%}\n nqx{{loop.index}} = {{d}}\n {% endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal\n{{ '{{' }} {{ explicitKPath.length }} {% raw %} + KPOINTS|length }} {% endraw %}\n{%- raw %}\n{% for point in KPOINTS -%}\n {% for d in point.coordinates %}{{ \"%14.9f\"|format(d) }} {% endfor -%}{{ point.weight }}\n{% endfor %}\n{% endraw -%}\n{% for point in explicitKPath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}0.0000001\n{% endfor %}\n","name":"pw_scf_bands_hse.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPathFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n 
outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ 
boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% raw %}{{PARAMETER | default('1')}} {{PARAMETER | default('1')}} {{PARAMETER | default('1')}} 0 0 0{% endraw 
%}\n","name":"pw_scf_kpt_conv.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'nscf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n{%- if subworkflowContext.NO_SYMMETRY_NO_INVERSION %}\n nosym = .true.\n noinv = .true.\n{%- endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_nscf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n nstep = 50\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ 
cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'vc-relax'\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_vc_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if 
subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'bands'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal_b\n{{kpath.length}}\n{% for point in kpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"pw_bands.in","contextProviders":[{"name":"KPathFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n 
ecutrho = {{ cutoffs.density }}\n ecutfock = 100\n occupations = 'smearing'\n degauss = 0.005\n input_dft='hse',\n nqx1 = {% if kgrid.dimensions[0]%2 == 0 %}{{kgrid.dimensions[0]/2}}{% else %}{{(kgrid.dimensions[0]+1)/2}}{% endif %}, nqx2 = {% if kgrid.dimensions[1]%2 == 0 %}{{kgrid.dimensions[1]/2}}{% else %}{{(kgrid.dimensions[1]+1)/2}}{% endif %}, nqx3 = {% if kgrid.dimensions[2]%2 == 0 %}{{kgrid.dimensions[2]/2}}{% else %}{{(kgrid.dimensions[2]+1)/2}}{% endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{% if d%2 == 0 %}{{d}} {% else %}{{d+1}} {% endif %}{% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_hse.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 
'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_u -%}\nU {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }}\n{% endfor -%}\n","name":"pw_scf_dft_u.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardUContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_v -%}\nV {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.atomicSpecies2 }}-{{ row.atomicOrbital2 }} {{ 
row.siteIndex }} {{ row.siteIndex2 }} {{ row.hubbardVValue }}\n{% endfor -%}\n","name":"pw_scf_dft_v.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardVContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n lda_plus_u = .true.\n lda_plus_u_kind = 0\n U_projection_type = 'ortho-atomic'\n {%- for row in hubbard_legacy %}\n Hubbard_U({{ row.atomicSpeciesIndex }}) = {{ row.hubbardUValue }}\n {%- endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_dft_u_legacy.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManagerLegacy"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&INPUT\n fildyn = 'dyn'\n zasr = 'simple'\n flfrc = 
'force_constants.fc'\n/\n","name":"q2r.in","contextProviders":[],"applicationName":"espresso","executableName":"q2r.x"},{"content":"&inputhp\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {%- for d in qgrid.dimensions %}\n nq{{ loop.index }} = {{ d }}\n {%- endfor %}\n/\n","name":"hp.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"hp.x"},{"content":"&inputpp\n calculation = \"eps\"\n prefix = \"__prefix__\"\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n\n&energy_grid\n smeartype = \"gauss\"\n intersmear = 0.2\n intrasmear = 0.0\n wmin = 0.0\n wmax = 30.0\n nw = 500\n shift = 0.0\n/\n\n","name":"epsilon.in","contextProviders":[],"applicationName":"espresso","executableName":"epsilon.x"},{"content":"# ------------------------------------------------------------------------------- #\n# #\n# Example JupyterLab requirements #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. A Python virtual environment is created #\n# - in /scratch/$USER/$JOB_ID (for build: 'Default') #\n# - in /export/share/python/ (for build: 'with-pre-installed-packages') #\n# 3. This list is used to populate a Python virtual environment #\n# 4. JupyterLab is started #\n# #\n# For more information visit: #\n# - https://jupyterlab.readthedocs.io/en/stable/index.html #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Note: With the JupyterLab build 'with-pre-installed-packages', packages #\n# cannot be added during the notebook runtime. 
#\n# #\n# ------------------------------------------------------------------------------- #\n\njupyterlab==3.0.3\nnotebook>=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR <=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. 
#\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. 
A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substitution below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is being trained to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continuous. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR < Date: Wed, 6 Dec 2023 09:56:45 +0800 Subject: [PATCH 5/6] SOF-7123: allow both U anv V context providers for DFT+U+V --- assets/espresso/pw_scf_dft_v.j2.in | 3 +++ src/js/data/templates.js | 2 +- templates/espresso/pw.x.yml | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/assets/espresso/pw_scf_dft_v.j2.in b/assets/espresso/pw_scf_dft_v.j2.in index 557a4f9e..d27451ce 100644 --- a/assets/espresso/pw_scf_dft_v.j2.in +++ b/assets/espresso/pw_scf_dft_v.j2.in @@ -42,6 +42,9 @@ CELL_PARAMETERS angstrom K_POINTS automatic {% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %} HUBBARD {ortho-atomic} +{% for row in hubbard_u -%} +U {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }} +{% endfor -%} {% for row in hubbard_v -%} V {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.atomicSpecies2 }}-{{ row.atomicOrbital2 }} {{ row.siteIndex }} {{ row.siteIndex2 }} {{ row.hubbardVValue }} {% endfor -%} diff --git a/src/js/data/templates.js b/src/js/data/templates.js index 8be783ed..029f136f 100644 --- a/src/js/data/templates.js +++ b/src/js/data/templates.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {allTemplates: 
[{"content":"1\npp.dat\n1.0\n3000\n3\n3.0000\n","name":"average.in","contextProviders":[],"applicationName":"espresso","executableName":"average.x"},{"content":"&BANDS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n filband = {% raw %}'{{ JOB_WORK_DIR }}/bands.dat'{% endraw %}\n no_overlap = .true.\n/\n\n","name":"bands.in","contextProviders":[],"applicationName":"espresso","executableName":"bands.x"},{"content":"&DOS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n/\n\n","name":"dos.in","contextProviders":[],"applicationName":"espresso","executableName":"dos.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"dynmat_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"dynmat.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! truncation (used for both correlation and exchange)\n truncation = '2d'\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of the Coulomb solver\n thres_coul = 1.0d-2\n\n ! configuration of W in the convolution\n model_coul = 'godby-needs'\n max_freq_coul = 120\n num_freq_coul = 35\n\n ! configuration of the Green solver\n thres_green = 1.0d-3\n max_iter_green = 300\n\n ! configuration for the correlation self energy\n ecut_corr = 5.0\n max_freq_corr = 100.0\n num_freq_corr = 11\n\n ! configuration for the exchange self energy\n ecut_exch = 20.0\n\n ! 
configuration for the output\n eta = 0.1\n min_freq_wind = -30.0\n max_freq_wind = 30.0\n num_freq_wind = 601\n/\n\n&gw_output\n/\n\nFREQUENCIES\n2\n 0.0 0.0\n 0.0 10.0\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_plasmon_pole.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of W in the convolution\n max_freq_coul = 200\n num_freq_coul = 51\n\n ! configuration for the correlation self energy\n ecut_corr = 6.0\n\n ! 
configuration for the exchange self energy\n ecut_exch = 15.0\n/\n\n&gw_output\n/\n\nFREQUENCIES\n35\n 0.0 0.0\n 0.0 0.3\n 0.0 0.9\n 0.0 1.8\n 0.0 3.0\n 0.0 4.5\n 0.0 6.3\n 0.0 8.4\n 0.0 10.8\n 0.0 13.5\n 0.0 16.5\n 0.0 19.8\n 0.0 23.4\n 0.0 27.3\n 0.0 31.5\n 0.0 36.0\n 0.0 40.8\n 0.0 45.9\n 0.0 51.3\n 0.0 57.0\n 0.0 63.0\n 0.0 69.3\n 0.0 75.9\n 0.0 82.8\n 0.0 90.0\n 0.0 97.5\n 0.0 105.3\n 0.0 113.4\n 0.0 121.8\n 0.0 130.5\n 0.0 139.5\n 0.0 148.8\n 0.0 158.4\n 0.0 168.3\n 0.0 178.5\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_full_frequency.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"matdyn_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc ='force_constants.fc'\n flfrq ='frequencies.freq'\n flvec ='normal_modes.out'\n q_in_band_form = .true.\n /\n{{ipath.length}}\n{% for point in ipath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"matdyn_path.in","contextProviders":[{"name":"IPathFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"BEGIN\nBEGIN_PATH_INPUT\n&PATH\n restart_mode = 'from_scratch'\n string_method = 'neb',\n nstep_path = 50,\n ds = 2.D0,\n opt_scheme = \"broyden\",\n num_of_images = {{ 2 + (input.INTERMEDIATE_IMAGES.length || neb.nImages) }},\n k_max = 0.3D0,\n k_min = 0.2D0,\n CI_scheme = \"auto\",\n path_thr = 
0.1D0,\n/\nEND_PATH_INPUT\nBEGIN_ENGINE_INPUT\n&CONTROL\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.03\n nspin = 2\n starting_magnetization = 0.5\n/\n&ELECTRONS\n conv_thr = 1.D-8\n mixing_beta = 0.3\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nBEGIN_POSITIONS\nFIRST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.FIRST_IMAGE }}\n{%- for IMAGE in input.INTERMEDIATE_IMAGES %}\nINTERMEDIATE_IMAGE\nATOMIC_POSITIONS crystal\n{{ IMAGE }}\n{%- endfor %}\nLAST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.LAST_IMAGE }}\nEND_POSITIONS\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nEND_ENGINE_INPUT\nEND\n","name":"neb.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"NEBFormDataManager"},{"name":"QENEBInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"neb.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n ldisp = .true.\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n{% for point in qpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor %}\n{% endfor 
%}\n","name":"ph_path.in","contextProviders":[{"name":"QPathFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n0 0 0\n","name":"ph_gamma.in","contextProviders":[],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .false.\n start_irr = 0\n last_irr = 0\n ldisp = .true.\n fildyn = 'dyn0'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_init_qpoints.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .true.\n ldisp = .true.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid_restart.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18\n ldisp = .true.\n {% raw -%}\n start_q = {{MAP_DATA.qpoint}}\n last_q = {{MAP_DATA.qpoint}}\n start_irr = {{MAP_DATA.irr}}\n last_irr= {{MAP_DATA.irr}}\n {%- endraw %}\n recover = .true.\n fildyn = 'dyn'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_SCRATCH_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_single_irr_qpt.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 0\n/\n&PLOT\n iflag = 3\n output_format = 5\n fileout 
='density.xsf'\n/\n\n","name":"pp_density.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 11\n/\n","name":"pp_electrostatic_potential.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&PROJWFC\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n deltaE = 0.05\n/\n","name":"projwfc.in","contextProviders":[],"applicationName":"espresso","executableName":"projwfc.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor 
%}\n","name":"pw_scf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n input_dft = 'hse',\n {% for d in qgrid.dimensions -%}\n nqx{{loop.index}} = {{d}}\n {% endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal\n{{ '{{' }} {{ explicitKPath.length }} {% raw %} + KPOINTS|length }} {% endraw %}\n{%- raw %}\n{% for point in KPOINTS -%}\n {% for d in point.coordinates %}{{ \"%14.9f\"|format(d) }} {% endfor -%}{{ point.weight }}\n{% endfor %}\n{% endraw -%}\n{% for point in explicitKPath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}0.0000001\n{% endfor %}\n","name":"pw_scf_bands_hse.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPathFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n 
outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ 
boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% raw %}{{PARAMETER | default('1')}} {{PARAMETER | default('1')}} {{PARAMETER | default('1')}} 0 0 0{% endraw 
%}\n","name":"pw_scf_kpt_conv.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'nscf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n{%- if subworkflowContext.NO_SYMMETRY_NO_INVERSION %}\n nosym = .true.\n noinv = .true.\n{%- endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_nscf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n nstep = 50\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ 
cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'vc-relax'\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_vc_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if 
subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'bands'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal_b\n{{kpath.length}}\n{% for point in kpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"pw_bands.in","contextProviders":[{"name":"KPathFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n 
ecutrho = {{ cutoffs.density }}\n ecutfock = 100\n occupations = 'smearing'\n degauss = 0.005\n input_dft='hse',\n nqx1 = {% if kgrid.dimensions[0]%2 == 0 %}{{kgrid.dimensions[0]/2}}{% else %}{{(kgrid.dimensions[0]+1)/2}}{% endif %}, nqx2 = {% if kgrid.dimensions[1]%2 == 0 %}{{kgrid.dimensions[1]/2}}{% else %}{{(kgrid.dimensions[1]+1)/2}}{% endif %}, nqx3 = {% if kgrid.dimensions[2]%2 == 0 %}{{kgrid.dimensions[2]/2}}{% else %}{{(kgrid.dimensions[2]+1)/2}}{% endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{% if d%2 == 0 %}{{d}} {% else %}{{d+1}} {% endif %}{% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_hse.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 
'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_u -%}\nU {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }}\n{% endfor -%}\n","name":"pw_scf_dft_u.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardUContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_v -%}\nV {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.atomicSpecies2 }}-{{ row.atomicOrbital2 }} {{ 
row.siteIndex }} {{ row.siteIndex2 }} {{ row.hubbardVValue }}\n{% endfor -%}\n","name":"pw_scf_dft_v.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardVContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_j -%}\n{{ row.paramType }} {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.value }}\n{% endfor -%}\n","name":"pw_scf_dft_j.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardJContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif 
-%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n lda_plus_u = .true.\n lda_plus_u_kind = 0\n U_projection_type = 'ortho-atomic'\n {%- for row in hubbard_legacy %}\n Hubbard_U({{ row.atomicSpeciesIndex }}) = {{ row.hubbardUValue }}\n {%- endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_dft_u_legacy.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManagerLegacy"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&INPUT\n fildyn = 'dyn'\n zasr = 'simple'\n flfrc = 'force_constants.fc'\n/\n","name":"q2r.in","contextProviders":[],"applicationName":"espresso","executableName":"q2r.x"},{"content":"&inputhp\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {%- for d in qgrid.dimensions %}\n nq{{ loop.index }} = {{ d }}\n {%- endfor %}\n/\n","name":"hp.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"hp.x"},{"content":"&inputpp\n calculation = 
\"eps\"\n prefix = \"__prefix__\"\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n\n&energy_grid\n smeartype = \"gauss\"\n intersmear = 0.2\n intrasmear = 0.0\n wmin = 0.0\n wmax = 30.0\n nw = 500\n shift = 0.0\n/\n\n","name":"epsilon.in","contextProviders":[],"applicationName":"espresso","executableName":"epsilon.x"},{"content":"# ------------------------------------------------------------------------------- #\n# #\n# Example JupyterLab requirements #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. A Python virtual environment is created #\n# - in /scratch/$USER/$JOB_ID (for build: 'Default') #\n# - in /export/share/python/ (for build: 'with-pre-installed-packages') #\n# 3. This list is used to populate a Python virtual environment #\n# 4. JupyterLab is started #\n# #\n# For more information visit: #\n# - https://jupyterlab.readthedocs.io/en/stable/index.html #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Note: With the JupyterLab build 'with-pre-installed-packages', packages #\n# cannot be added during the notebook runtime. 
#\n# #\n# ------------------------------------------------------------------------------- #\n\njupyterlab==3.0.3\nnotebook>=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR <=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. 
#\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. 
A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 
'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n 
ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final 
values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). 
#\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. 
The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. 
#\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). 
#\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. 
The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. 
This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in 
self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. 
#\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. 
During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. 
In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. 
The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. 
#\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. 
#\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. 
#\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, 
\"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, 
\"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n 
min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. 
Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they 
have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = 
sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. 
#\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = 
context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, 
\"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. 
Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true 
= target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the classification problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # 
Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a file named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the classification problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n 
context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. 
#\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = 
test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = 
{rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). 
When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some 
predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n 
plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. 
#\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. 
Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n 
plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 
1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. 
merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). 
It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR < Date: Fri, 8 Dec 2023 14:03:58 +0800 Subject: [PATCH 6/6] SOF-7123: rename template names pw_scf_dft_v -> pw_scf_dft_u+v, pw_scf_dft_j -> pw_scf_dft_u+j --- executables/espresso/pw.x.yml | 4 ++-- src/js/data/tree.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/executables/espresso/pw.x.yml b/executables/espresso/pw.x.yml index 4f3b37fb..13a19265 100644 --- a/executables/espresso/pw.x.yml +++ b/executables/espresso/pw.x.yml @@ -96,7 +96,7 @@ flavors: supportedApplicationVersions: - '7.2' - pw_scf_dft_v: + pw_scf_dft_u+v: input: - name: pw_scf_dft_v.in results: @@ -116,7 +116,7 @@ flavors: supportedApplicationVersions: - '7.2' - pw_scf_dft_j: + pw_scf_dft_u+j: input: - name: pw_scf_dft_j.in results: diff --git a/src/js/data/tree.js b/src/js/data/tree.js index 96cfd373..37f32cac 100644 --- a/src/js/data/tree.js +++ b/src/js/data/tree.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {applicationTree: 
{"espresso":{"average.x":{"monitors":["standard_output"],"results":["average_potential_profile"],"flavors":{"average":{"input":[{"name":"average.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"average.x"},"average_potential":{"input":[{"name":"average.in"}],"results":["average_potential_profile"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"average.x"}}},"bands.x":{"monitors":["standard_output"],"results":["band_structure"],"flavors":{"bands":{"input":[{"name":"bands.in"}],"results":["band_structure"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"bands.x"}}},"dos.x":{"monitors":["standard_output"],"results":["density_of_states"],"flavors":{"dos":{"input":[{"name":"dos.in"}],"results":["density_of_states"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"dos.x"}}},"dynmat.x":{"monitors":["standard_output"],"results":[],"flavors":{"dynmat":{"input":[{"name":"dynmat_grid.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"dynmat.x"}}},"gw.x":{"monitors":["standard_output"],"results":["band_structure","fermi_energy","band_gaps"],"flavors":{"gw_bands_plasmon_pole":{"input":[{"name":"gw_bands_plasmon_pole.in"}],"results":["band_structure","fermi_energy","band_gaps"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"gw.x"},"gw_bands_full_frequency":{"input":[{"name":"gw_bands_full_frequency.in"}],"results":["band_structure","fermi_energy","band_gaps"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"gw.x"}}},"matdyn.x":{"monitors":["standard_output"],"results":["phonon_dos","phonon_dispersions"],"flavors":{"matdyn_grid":{"input":[{"name":"matdyn_grid.in"}],"monitors":["standard_output"],"results":["phonon_dos"],"applicationName":"espresso","executableName":"matdyn.x"},"matdyn_path":{"input":[{"name":"matdyn_path.in"}],"monitors":["s
tandard_output"],"results":["phonon_dispersions"],"applicationName":"espresso","executableName":"matdyn.x"}}},"neb.x":{"monitors":["standard_output"],"results":["reaction_energy_barrier","reaction_energy_profile"],"flavors":{"neb":{"isMultiMaterial":true,"input":[{"name":"neb.in"}],"results":["reaction_energy_barrier","reaction_energy_profile"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"neb.x"}}},"ph.x":{"monitors":["standard_output"],"results":["phonon_dos","phonon_dispersions","zero_point_energy"],"flavors":{"ph_path":{"input":[{"name":"ph_path.in"}],"results":["phonon_dispersions"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_grid":{"input":[{"name":"ph_grid.in"}],"results":["phonon_dos"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_gamma":{"input":[{"name":"ph_gamma.in"}],"results":["zero_point_energy"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_init_qpoints":{"input":[{"name":"ph_init_qpoints.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_grid_restart":{"input":[{"name":"ph_grid_restart.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_single_irr_qpt":{"input":[{"name":"ph_single_irr_qpt.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"}}},"pp.x":{"monitors":["standard_output"],"results":[],"flavors":{"pp_density":{"input":[{"name":"pp_density.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pp.x"},"pp_electrostatic_potential":{"input":[{"name":"pp_electrostatic_potential.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pp.x"}}},"projwfc.x":{"monitors":["standard_output"],"results":["density_of_states"],"flavors":{"projwfc":{"input"
:[{"name":"projwfc.in"}],"results":["density_of_states"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"projwfc.x"}}},"pw.x":{"isDefault":true,"hasAdvancedComputeOptions":true,"postProcessors":["remove_non_zero_weight_kpoints"],"monitors":["standard_output","convergence_ionic","convergence_electronic"],"results":["atomic_forces","band_gaps","charge_density_profile","density_of_states","fermi_energy","final_structure","magnetic_moments","potential_profile","pressure","reaction_energy_barrier","reaction_energy_profile","stress_tensor","total_energy","total_energy_contributions","total_force"],"flavors":{"pw_scf":{"isDefault":true,"input":[{"name":"pw_scf.in"}],"results":["atomic_forces","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_bands_hse":{"input":[{"name":"pw_scf_bands_hse.in"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_hse":{"input":[{"name":"pw_scf_hse.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_dft_u":{"input":[{"name":"pw_scf_dft_u.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["7.2"]},"pw_scf_dft_v":{"input":[{"name":"pw_scf_dft_v.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stres
s_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["7.2"]},"pw_scf_dft_j":{"input":[{"name":"pw_scf_dft_j.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["7.2"]},"pw_scf_dft_u_legacy":{"input":[{"name":"pw_scf_dft_u_legacy.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["5.2.1","5.4.0","6.0.0","6.3","6.4.1","6.5.0","6.6.0","6.7.0","6.8.0","7.0"]},"pw_esm":{"input":[{"name":"pw_esm.in"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor","potential_profile","charge_density_profile"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_esm_relax":{"input":[{"name":"pw_esm_relax.in"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor","potential_profile","charge_density_profile"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_nscf":{"input":[{"name":"pw_nscf.in"}],"results":["fermi_energy","band_gaps"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pw.x"},"pw_bands":{"input":[{"name":"pw_bands.in"}],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pw.x"},"pw_relax":{"input":[{"name":"pw_relax.in"}],"monitors":[
"standard_output","convergence_electronic","convergence_ionic"],"results":["total_energy","fermi_energy","pressure","atomic_forces","total_force","stress_tensor","final_structure"],"applicationName":"espresso","executableName":"pw.x"},"pw_vc-relax":{"input":[{"name":"pw_vc_relax.in"}],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"results":["total_energy","fermi_energy","pressure","atomic_forces","total_force","stress_tensor","final_structure"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_kpt_conv":{"input":[{"name":"pw_scf_kpt_conv.in"}],"results":["total_energy","fermi_energy","pressure","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"}}},"q2r.x":{"monitors":["standard_output"],"results":[],"flavors":{"q2r":{"input":[{"name":"q2r.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"q2r.x"}}},"hp.x":{"monitors":["standard_output"],"results":[],"flavors":{"hp":{"input":[{"name":"hp.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"hp.x"}},"supportedApplicationVersions":["7.0","7.2"]},"epsilon.x":{"monitors":["standard_output"],"results":["dielectric_tensor"],"flavors":{"dielectric_tensor":{"input":[{"name":"epsilon.in"}],"results":["dielectric_tensor"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"epsilon.x"}}}},"jupyterLab":{"jupyter":{"isDefault":true,"monitors":["standard_output","jupyter_notebook_endpoint"],"results":[],"flavors":{"notebook":{"isDefault":true,"input":[{"name":"requirements.txt","templateName":"requirements.txt"}],"monitors":["standard_output","jupyter_notebook_endpoint"],"applicationName":"jupyterLab","executableName":"jupyter"}}}},"exabyteml":{"score":{"isDefault":false,"monitors":["standard_output"],"results":["predicted_properties"],"flavors":{"score":{"isDefault":true,"
input":[],"monitors":["standard_output"],"applicationName":"exabyteml","executableName":"score"}}},"train":{"isDefault":true,"monitors":["standard_output"],"results":["workflow:ml_predict"],"flavors":{"train":{"isDefault":true,"input":[],"monitors":["standard_output"],"applicationName":"exabyteml","executableName":"train"}}}},"nwchem":{"nwchem":{"isDefault":true,"hasAdvancedComputeOptions":false,"postProcessors":["error_handler"],"monitors":["standard_output"],"results":["total_energy","total_energy_contributions"],"flavors":{"nwchem_total_energy":{"isDefault":true,"input":[{"name":"nwchem_total_energy.inp"}],"results":["total_energy","total_energy_contributions"],"monitors":["standard_output"],"applicationName":"nwchem","executableName":"nwchem"}}}},"python":{"python":{"isDefault":true,"monitors":["standard_output"],"results":["file_content","workflow:pyml_predict"],"flavors":{"hello_world":{"isDefault":true,"input":[{"name":"script.py","templateName":"hello_world.py"},{"name":"requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"espresso_xml_get_qpt_irr":{"input":[{"name":"espresso_xml_get_qpt_irr.py"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"espresso_extract_kpoints":{"input":[{"name":"espresso_extract_kpoints.py"},{"name":"requirements.txt","templateName":"requirements_empty.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"generic:post_processing:plot:matplotlib":{"input":[{"name":"plot.py","templateName":"matplotlib_basic.py"},{"name":"requirements.txt","templateName":"processing_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"generic:processing:find_extrema:scipy":{"input":[{"name":"find_extrema.py","templateName":"find_extrema.py"},{"name":"requirements.txt","templateName":"processing_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python",
"executableName":"python"},"pyml:setup_variables_packages":{"input":[{"name":"settings.py","templateName":"pyml_settings.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:custom":{"input":[{"name":"pyml_custom.py","templateName":"pyml_custom.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:data_input:read_csv:pandas":{"input":[{"name":"data_input_read_csv_pandas.py","templateName":"data_input_read_csv_pandas.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:data_input:train_test_split:sklearn":{"input":[{"name":"data_input_train_test_split_sklearn.py","templateName":"data_input_train_test_split_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:min_max_scaler:sklearn":{"input":[{"name":"pre_processing_min_max_sklearn.py","templateName":"pre_processing_min_max_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:remove_duplicates:pandas":{"input":[{"name":"pre_processing_remove_duplicates_pandas.py","templateName":"pre_processing_remove_duplicates_pandas.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:remove_missing:pandas":{"input":[{"name":"pre_processing_remove_missing_pandas.py","templateName":"pre_processing_remove_missing_pandas.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],
"applicationName":"python","executableName":"python"},"pyml:pre_processing:standardization:sklearn":{"input":[{"name":"pre_processing_standardization_sklearn.py","templateName":"pre_processing_standardization_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:adaboosted_trees_regression:sklearn":{"input":[{"name":"model_adaboosted_trees_regression_sklearn.py","templateName":"model_adaboosted_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"results":["workflow:pyml_predict"],"applicationName":"python","executableName":"python"},"pyml:model:bagged_trees_regression:sklearn":{"input":[{"name":"model_bagged_trees_regression_sklearn.py","templateName":"model_bagged_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:gradboosted_trees_regression:sklearn":{"input":[{"name":"model_gradboosted_trees_regression_sklearn.py","templateName":"model_gradboosted_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:extreme_gradboosted_trees_regression:sklearn":{"input":[{"name":"model_extreme_gradboosted_trees_regression_sklearn.py","templateName":"model_extreme_gradboosted_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:k_means_clustering:sklearn":{"input":[{"name":"model_k_means_clustering_sklearn.py","templateName":"model_k_means_clustering_sklear
n.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:kernel_ridge_regression:sklearn":{"input":[{"name":"model_kernel_ridge_regression_sklearn.py","templateName":"model_kernel_ridge_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:lasso_regression:sklearn":{"input":[{"name":"model_lasso_regression_sklearn.py","templateName":"model_lasso_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:multilayer_perceptron:sklearn":{"input":[{"name":"model_mlp_sklearn.py","templateName":"model_mlp_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:random_forest_classification:sklearn":{"input":[{"name":"model_random_forest_classification_sklearn.py","templateName":"model_random_forest_classification_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:gradboosted_trees_classification:sklearn":{"input":[{"name":"model_gradboosted_trees_classification_sklearn.py","templateName":"model_gradboosted_trees_classification_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:extreme_gradboosted_trees_classification:sk
learn":{"input":[{"name":"model_extreme_gradboosted_trees_classification_sklearn.py","templateName":"model_extreme_gradboosted_trees_classification_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:random_forest_regression:sklearn":{"input":[{"name":"model_random_forest_regression_sklearn.py","templateName":"model_random_forest_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:ridge_regression:sklearn":{"input":[{"name":"model_ridge_regression_sklearn.py","templateName":"model_ridge_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:post_processing:parity_plot:matplotlib":{"input":[{"name":"post_processing_parity_plot_matplotlib.py","templateName":"post_processing_parity_plot_matplotlib.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["file_content"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:post_processing:pca_2d_clusters:matplotlib":{"input":[{"name":"post_processing_pca_2d_clusters_matplotlib.py","templateName":"post_processing_pca_2d_clusters_matplotlib.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["file_content"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:post_processing:roc_curve:sklearn":{"input":[{"name":"post_processing_roc_curve_sklearn.py","templateName":"post_processing_roc_curve_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["file_content"],"mon
itors":["standard_output"],"applicationName":"python","executableName":"python"}}}},"shell":{"sh":{"isDefault":true,"monitors":["standard_output"],"results":["atomic_forces","band_gaps","band_structure","density_of_states","fermi_energy","phonon_dispersions","phonon_dos","pressure","stress_tensor","total_energy","total_energy_contributions","total_force","zero_point_energy","final_structure","magnetic_moments","reaction_energy_barrier","reaction_energy_profile","potential_profile","charge_density_profile"],"flavors":{"hello_world":{"isDefault":true,"input":[{"name":"hello_world.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"job_espresso_pw_scf":{"input":[{"name":"job_espresso_pw_scf.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"espresso_link_outdir_save":{"input":[{"name":"espresso_link_outdir_save.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"espresso_collect_dynmat":{"input":[{"name":"espresso_collect_dynmat.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"bash_vasp_prepare_neb_images":{"isMultiMaterial":true,"input":[{"name":"bash_vasp_prepare_neb_images.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"}}}},"vasp":{"vasp":{"isDefault":true,"postProcessors":["error_handler","prepare_restart","remove_non_zero_weight_kpoints"],"monitors":["standard_output","convergence_ionic","convergence_electronic"],"results":["atomic_forces","band_gaps","band_structure","density_of_states","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force","zero_point_energy","final_structure","magnetic_moments","reaction_energy_barrier","reaction_energy_profile","potential_profile","charge_density_profile"],"flavors":{"vasp":{"isDefault":true,"input":[{"name":"INCAR"},{"name":"KPOINTS"},{"name":"POSCAR"}],"results":["total_energy","total_energy_contributio
ns","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_bands":{"input":[{"name":"INCAR","templateName":"INCAR_BANDS"},{"name":"KPOINTS","templateName":"KPOINTS_BANDS"},{"name":"POSCAR","templateName":""}],"results":["band_structure"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_nscf":{"input":[{"name":"INCAR","templateName":"INCAR_BANDS"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["band_gaps","fermi_energy"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_hse":{"isDefault":false,"input":[{"name":"INCAR","templateName":"INCAR_HSE"},{"name":"KPOINTS"},{"name":"POSCAR"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_bands_hse":{"isDefault":false,"input":[{"name":"INCAR","templateName":"INCAR_BANDS_HSE"},{"name":"KPOINTS","templateName":"KPOINTS_BANDS"},{"name":"POSCAR","templateName":""}],"results":["band_structure"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_nscf_hse":{"isDefault":false,"input":[{"name":"INCAR","templateName":"INCAR_BANDS_HSE"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["band_gaps","fermi_energy"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_relax":{"input":[{"name":"INCAR","templateName":"INCAR_RELAX"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","atomic_forces","fermi_energy","pressure","str
ess_tensor","total_force","final_structure"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"postProcessors":["prepare_restart"],"applicationName":"vasp","executableName":"vasp"},"vasp_vc_relax":{"input":[{"name":"INCAR","templateName":"INCAR_VC_RELAX"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","atomic_forces","fermi_energy","pressure","stress_tensor","total_force","final_structure"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"postProcessors":["prepare_restart"],"applicationName":"vasp","executableName":"vasp"},"vasp_zpe":{"input":[{"name":"INCAR","templateName":"INCAR_ZPE"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","fermi_energy","pressure","atomic_forces","stress_tensor","total_force","zero_point_energy"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"applicationName":"vasp","executableName":"vasp"},"vasp_kpt_conv":{"input":[{"name":"INCAR","templateName":"INCAR"},{"name":"KPOINTS","templateName":"KPOINTS_CONV"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_vc_relax_conv":{"input":[{"name":"INCAR","templateName":"INCAR_VC_RELAX"},{"name":"KPOINTS","templateName":"KPOINTS_CONV"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"applicationName":"vasp","executableName":"vasp"},"vasp_neb":{"isMultiMaterial":true,"input":[{"name":"INCAR","templateName":"INCAR_NEB"},{"name":"KPOINTS","templateName":"KPOINTS"}],"r
esults":["reaction_energy_barrier","reaction_energy_profile"],"monitors":["standard_output"],"applicationName":"vasp","executableName":"vasp"},"vasp_neb_initial":{"isMultiMaterial":true,"input":[{"name":"INCAR","templateName":"INCAR_NEB_INITIAL_FINAL"},{"name":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR_NEB_INITIAL"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_neb_final":{"isMultiMaterial":true,"input":[{"name":"INCAR","templateName":"INCAR_NEB_INITIAL_FINAL"},{"name":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR_NEB_FINAL"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"}}}}}} +module.exports = {applicationTree: 
{"espresso":{"average.x":{"monitors":["standard_output"],"results":["average_potential_profile"],"flavors":{"average":{"input":[{"name":"average.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"average.x"},"average_potential":{"input":[{"name":"average.in"}],"results":["average_potential_profile"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"average.x"}}},"bands.x":{"monitors":["standard_output"],"results":["band_structure"],"flavors":{"bands":{"input":[{"name":"bands.in"}],"results":["band_structure"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"bands.x"}}},"dos.x":{"monitors":["standard_output"],"results":["density_of_states"],"flavors":{"dos":{"input":[{"name":"dos.in"}],"results":["density_of_states"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"dos.x"}}},"dynmat.x":{"monitors":["standard_output"],"results":[],"flavors":{"dynmat":{"input":[{"name":"dynmat_grid.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"dynmat.x"}}},"gw.x":{"monitors":["standard_output"],"results":["band_structure","fermi_energy","band_gaps"],"flavors":{"gw_bands_plasmon_pole":{"input":[{"name":"gw_bands_plasmon_pole.in"}],"results":["band_structure","fermi_energy","band_gaps"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"gw.x"},"gw_bands_full_frequency":{"input":[{"name":"gw_bands_full_frequency.in"}],"results":["band_structure","fermi_energy","band_gaps"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"gw.x"}}},"matdyn.x":{"monitors":["standard_output"],"results":["phonon_dos","phonon_dispersions"],"flavors":{"matdyn_grid":{"input":[{"name":"matdyn_grid.in"}],"monitors":["standard_output"],"results":["phonon_dos"],"applicationName":"espresso","executableName":"matdyn.x"},"matdyn_path":{"input":[{"name":"matdyn_path.in"}],"monitors":["s
tandard_output"],"results":["phonon_dispersions"],"applicationName":"espresso","executableName":"matdyn.x"}}},"neb.x":{"monitors":["standard_output"],"results":["reaction_energy_barrier","reaction_energy_profile"],"flavors":{"neb":{"isMultiMaterial":true,"input":[{"name":"neb.in"}],"results":["reaction_energy_barrier","reaction_energy_profile"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"neb.x"}}},"ph.x":{"monitors":["standard_output"],"results":["phonon_dos","phonon_dispersions","zero_point_energy"],"flavors":{"ph_path":{"input":[{"name":"ph_path.in"}],"results":["phonon_dispersions"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_grid":{"input":[{"name":"ph_grid.in"}],"results":["phonon_dos"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_gamma":{"input":[{"name":"ph_gamma.in"}],"results":["zero_point_energy"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_init_qpoints":{"input":[{"name":"ph_init_qpoints.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_grid_restart":{"input":[{"name":"ph_grid_restart.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"},"ph_single_irr_qpt":{"input":[{"name":"ph_single_irr_qpt.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"ph.x"}}},"pp.x":{"monitors":["standard_output"],"results":[],"flavors":{"pp_density":{"input":[{"name":"pp_density.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pp.x"},"pp_electrostatic_potential":{"input":[{"name":"pp_electrostatic_potential.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pp.x"}}},"projwfc.x":{"monitors":["standard_output"],"results":["density_of_states"],"flavors":{"projwfc":{"input"
:[{"name":"projwfc.in"}],"results":["density_of_states"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"projwfc.x"}}},"pw.x":{"isDefault":true,"hasAdvancedComputeOptions":true,"postProcessors":["remove_non_zero_weight_kpoints"],"monitors":["standard_output","convergence_ionic","convergence_electronic"],"results":["atomic_forces","band_gaps","charge_density_profile","density_of_states","fermi_energy","final_structure","magnetic_moments","potential_profile","pressure","reaction_energy_barrier","reaction_energy_profile","stress_tensor","total_energy","total_energy_contributions","total_force"],"flavors":{"pw_scf":{"isDefault":true,"input":[{"name":"pw_scf.in"}],"results":["atomic_forces","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_bands_hse":{"input":[{"name":"pw_scf_bands_hse.in"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_hse":{"input":[{"name":"pw_scf_hse.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_dft_u":{"input":[{"name":"pw_scf_dft_u.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["7.2"]},"pw_scf_dft_u+v":{"input":[{"name":"pw_scf_dft_v.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","str
ess_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["7.2"]},"pw_scf_dft_u+j":{"input":[{"name":"pw_scf_dft_j.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["7.2"]},"pw_scf_dft_u_legacy":{"input":[{"name":"pw_scf_dft_u_legacy.in"}],"results":["atomic_forces","band_gaps","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x","supportedApplicationVersions":["5.2.1","5.4.0","6.0.0","6.3","6.4.1","6.5.0","6.6.0","6.7.0","6.8.0","7.0"]},"pw_esm":{"input":[{"name":"pw_esm.in"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor","potential_profile","charge_density_profile"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_esm_relax":{"input":[{"name":"pw_esm_relax.in"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor","potential_profile","charge_density_profile"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"},"pw_nscf":{"input":[{"name":"pw_nscf.in"}],"results":["fermi_energy","band_gaps"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pw.x"},"pw_bands":{"input":[{"name":"pw_bands.in"}],"monitors":["standard_output"],"applicationName":"espresso","executableName":"pw.x"},"pw_relax":{"input":[{"name":"pw_relax.in"}],"monitor
s":["standard_output","convergence_electronic","convergence_ionic"],"results":["total_energy","fermi_energy","pressure","atomic_forces","total_force","stress_tensor","final_structure"],"applicationName":"espresso","executableName":"pw.x"},"pw_vc-relax":{"input":[{"name":"pw_vc_relax.in"}],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"results":["total_energy","fermi_energy","pressure","atomic_forces","total_force","stress_tensor","final_structure"],"applicationName":"espresso","executableName":"pw.x"},"pw_scf_kpt_conv":{"input":[{"name":"pw_scf_kpt_conv.in"}],"results":["total_energy","fermi_energy","pressure","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"espresso","executableName":"pw.x"}}},"q2r.x":{"monitors":["standard_output"],"results":[],"flavors":{"q2r":{"input":[{"name":"q2r.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"q2r.x"}}},"hp.x":{"monitors":["standard_output"],"results":[],"flavors":{"hp":{"input":[{"name":"hp.in"}],"results":[],"monitors":["standard_output"],"applicationName":"espresso","executableName":"hp.x"}},"supportedApplicationVersions":["7.0","7.2"]},"epsilon.x":{"monitors":["standard_output"],"results":["dielectric_tensor"],"flavors":{"dielectric_tensor":{"input":[{"name":"epsilon.in"}],"results":["dielectric_tensor"],"monitors":["standard_output"],"applicationName":"espresso","executableName":"epsilon.x"}}}},"jupyterLab":{"jupyter":{"isDefault":true,"monitors":["standard_output","jupyter_notebook_endpoint"],"results":[],"flavors":{"notebook":{"isDefault":true,"input":[{"name":"requirements.txt","templateName":"requirements.txt"}],"monitors":["standard_output","jupyter_notebook_endpoint"],"applicationName":"jupyterLab","executableName":"jupyter"}}}},"exabyteml":{"score":{"isDefault":false,"monitors":["standard_output"],"results":["predicted_properties"],"flavors":{"score":{"isDefault":tr
ue,"input":[],"monitors":["standard_output"],"applicationName":"exabyteml","executableName":"score"}}},"train":{"isDefault":true,"monitors":["standard_output"],"results":["workflow:ml_predict"],"flavors":{"train":{"isDefault":true,"input":[],"monitors":["standard_output"],"applicationName":"exabyteml","executableName":"train"}}}},"nwchem":{"nwchem":{"isDefault":true,"hasAdvancedComputeOptions":false,"postProcessors":["error_handler"],"monitors":["standard_output"],"results":["total_energy","total_energy_contributions"],"flavors":{"nwchem_total_energy":{"isDefault":true,"input":[{"name":"nwchem_total_energy.inp"}],"results":["total_energy","total_energy_contributions"],"monitors":["standard_output"],"applicationName":"nwchem","executableName":"nwchem"}}}},"python":{"python":{"isDefault":true,"monitors":["standard_output"],"results":["file_content","workflow:pyml_predict"],"flavors":{"hello_world":{"isDefault":true,"input":[{"name":"script.py","templateName":"hello_world.py"},{"name":"requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"espresso_xml_get_qpt_irr":{"input":[{"name":"espresso_xml_get_qpt_irr.py"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"espresso_extract_kpoints":{"input":[{"name":"espresso_extract_kpoints.py"},{"name":"requirements.txt","templateName":"requirements_empty.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"generic:post_processing:plot:matplotlib":{"input":[{"name":"plot.py","templateName":"matplotlib_basic.py"},{"name":"requirements.txt","templateName":"processing_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"generic:processing:find_extrema:scipy":{"input":[{"name":"find_extrema.py","templateName":"find_extrema.py"},{"name":"requirements.txt","templateName":"processing_requirements.txt"}],"monitors":["standard_output"],"applicationName":"pyth
on","executableName":"python"},"pyml:setup_variables_packages":{"input":[{"name":"settings.py","templateName":"pyml_settings.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:custom":{"input":[{"name":"pyml_custom.py","templateName":"pyml_custom.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:data_input:read_csv:pandas":{"input":[{"name":"data_input_read_csv_pandas.py","templateName":"data_input_read_csv_pandas.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:data_input:train_test_split:sklearn":{"input":[{"name":"data_input_train_test_split_sklearn.py","templateName":"data_input_train_test_split_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:min_max_scaler:sklearn":{"input":[{"name":"pre_processing_min_max_sklearn.py","templateName":"pre_processing_min_max_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:remove_duplicates:pandas":{"input":[{"name":"pre_processing_remove_duplicates_pandas.py","templateName":"pre_processing_remove_duplicates_pandas.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:remove_missing:pandas":{"input":[{"name":"pre_processing_remove_missing_pandas.py","templateName":"pre_processing_remove_missing_pandas.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_outpu
t"],"applicationName":"python","executableName":"python"},"pyml:pre_processing:standardization:sklearn":{"input":[{"name":"pre_processing_standardization_sklearn.py","templateName":"pre_processing_standardization_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:adaboosted_trees_regression:sklearn":{"input":[{"name":"model_adaboosted_trees_regression_sklearn.py","templateName":"model_adaboosted_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"monitors":["standard_output"],"results":["workflow:pyml_predict"],"applicationName":"python","executableName":"python"},"pyml:model:bagged_trees_regression:sklearn":{"input":[{"name":"model_bagged_trees_regression_sklearn.py","templateName":"model_bagged_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:gradboosted_trees_regression:sklearn":{"input":[{"name":"model_gradboosted_trees_regression_sklearn.py","templateName":"model_gradboosted_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:extreme_gradboosted_trees_regression:sklearn":{"input":[{"name":"model_extreme_gradboosted_trees_regression_sklearn.py","templateName":"model_extreme_gradboosted_trees_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:k_means_clustering:sklearn":{"input":[{"name":"model_k_means_clustering_sklearn.py","templateName":"model_k_means_clustering_sk
learn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:kernel_ridge_regression:sklearn":{"input":[{"name":"model_kernel_ridge_regression_sklearn.py","templateName":"model_kernel_ridge_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:lasso_regression:sklearn":{"input":[{"name":"model_lasso_regression_sklearn.py","templateName":"model_lasso_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:multilayer_perceptron:sklearn":{"input":[{"name":"model_mlp_sklearn.py","templateName":"model_mlp_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:random_forest_classification:sklearn":{"input":[{"name":"model_random_forest_classification_sklearn.py","templateName":"model_random_forest_classification_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:gradboosted_trees_classification:sklearn":{"input":[{"name":"model_gradboosted_trees_classification_sklearn.py","templateName":"model_gradboosted_trees_classification_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:extreme_gradboosted_trees_classificatio
n:sklearn":{"input":[{"name":"model_extreme_gradboosted_trees_classification_sklearn.py","templateName":"model_extreme_gradboosted_trees_classification_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:random_forest_regression:sklearn":{"input":[{"name":"model_random_forest_regression_sklearn.py","templateName":"model_random_forest_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:model:ridge_regression:sklearn":{"input":[{"name":"model_ridge_regression_sklearn.py","templateName":"model_ridge_regression_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["workflow:pyml_predict"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:post_processing:parity_plot:matplotlib":{"input":[{"name":"post_processing_parity_plot_matplotlib.py","templateName":"post_processing_parity_plot_matplotlib.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["file_content"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:post_processing:pca_2d_clusters:matplotlib":{"input":[{"name":"post_processing_pca_2d_clusters_matplotlib.py","templateName":"post_processing_pca_2d_clusters_matplotlib.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["file_content"],"monitors":["standard_output"],"applicationName":"python","executableName":"python"},"pyml:post_processing:roc_curve:sklearn":{"input":[{"name":"post_processing_roc_curve_sklearn.py","templateName":"post_processing_roc_curve_sklearn.py"},{"name":"requirements.txt","templateName":"pyml_requirements.txt"}],"results":["file_content"],
"monitors":["standard_output"],"applicationName":"python","executableName":"python"}}}},"shell":{"sh":{"isDefault":true,"monitors":["standard_output"],"results":["atomic_forces","band_gaps","band_structure","density_of_states","fermi_energy","phonon_dispersions","phonon_dos","pressure","stress_tensor","total_energy","total_energy_contributions","total_force","zero_point_energy","final_structure","magnetic_moments","reaction_energy_barrier","reaction_energy_profile","potential_profile","charge_density_profile"],"flavors":{"hello_world":{"isDefault":true,"input":[{"name":"hello_world.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"job_espresso_pw_scf":{"input":[{"name":"job_espresso_pw_scf.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"espresso_link_outdir_save":{"input":[{"name":"espresso_link_outdir_save.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"espresso_collect_dynmat":{"input":[{"name":"espresso_collect_dynmat.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"},"bash_vasp_prepare_neb_images":{"isMultiMaterial":true,"input":[{"name":"bash_vasp_prepare_neb_images.sh"}],"monitors":["standard_output"],"applicationName":"shell","executableName":"sh"}}}},"vasp":{"vasp":{"isDefault":true,"postProcessors":["error_handler","prepare_restart","remove_non_zero_weight_kpoints"],"monitors":["standard_output","convergence_ionic","convergence_electronic"],"results":["atomic_forces","band_gaps","band_structure","density_of_states","fermi_energy","pressure","stress_tensor","total_energy","total_energy_contributions","total_force","zero_point_energy","final_structure","magnetic_moments","reaction_energy_barrier","reaction_energy_profile","potential_profile","charge_density_profile"],"flavors":{"vasp":{"isDefault":true,"input":[{"name":"INCAR"},{"name":"KPOINTS"},{"name":"POSCAR"}],"results":["total_energy","total_energy_contrib
utions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_bands":{"input":[{"name":"INCAR","templateName":"INCAR_BANDS"},{"name":"KPOINTS","templateName":"KPOINTS_BANDS"},{"name":"POSCAR","templateName":""}],"results":["band_structure"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_nscf":{"input":[{"name":"INCAR","templateName":"INCAR_BANDS"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["band_gaps","fermi_energy"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_hse":{"isDefault":false,"input":[{"name":"INCAR","templateName":"INCAR_HSE"},{"name":"KPOINTS"},{"name":"POSCAR"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_bands_hse":{"isDefault":false,"input":[{"name":"INCAR","templateName":"INCAR_BANDS_HSE"},{"name":"KPOINTS","templateName":"KPOINTS_BANDS"},{"name":"POSCAR","templateName":""}],"results":["band_structure"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_nscf_hse":{"isDefault":false,"input":[{"name":"INCAR","templateName":"INCAR_BANDS_HSE"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["band_gaps","fermi_energy"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_relax":{"input":[{"name":"INCAR","templateName":"INCAR_RELAX"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","atomic_forces","fermi_energy","pressure",
"stress_tensor","total_force","final_structure"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"postProcessors":["prepare_restart"],"applicationName":"vasp","executableName":"vasp"},"vasp_vc_relax":{"input":[{"name":"INCAR","templateName":"INCAR_VC_RELAX"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","atomic_forces","fermi_energy","pressure","stress_tensor","total_force","final_structure"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"postProcessors":["prepare_restart"],"applicationName":"vasp","executableName":"vasp"},"vasp_zpe":{"input":[{"name":"INCAR","templateName":"INCAR_ZPE"},{"name":"KPOINTS","templateName":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","fermi_energy","pressure","atomic_forces","stress_tensor","total_force","zero_point_energy"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"applicationName":"vasp","executableName":"vasp"},"vasp_kpt_conv":{"input":[{"name":"INCAR","templateName":"INCAR"},{"name":"KPOINTS","templateName":"KPOINTS_CONV"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_vc_relax_conv":{"input":[{"name":"INCAR","templateName":"INCAR_VC_RELAX"},{"name":"KPOINTS","templateName":"KPOINTS_CONV"},{"name":"POSCAR","templateName":"POSCAR"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic","convergence_ionic"],"applicationName":"vasp","executableName":"vasp"},"vasp_neb":{"isMultiMaterial":true,"input":[{"name":"INCAR","templateName":"INCAR_NEB"},{"name":"KPOINTS","templateName":"KPOINTS"}
],"results":["reaction_energy_barrier","reaction_energy_profile"],"monitors":["standard_output"],"applicationName":"vasp","executableName":"vasp"},"vasp_neb_initial":{"isMultiMaterial":true,"input":[{"name":"INCAR","templateName":"INCAR_NEB_INITIAL_FINAL"},{"name":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR_NEB_INITIAL"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"},"vasp_neb_final":{"isMultiMaterial":true,"input":[{"name":"INCAR","templateName":"INCAR_NEB_INITIAL_FINAL"},{"name":"KPOINTS"},{"name":"POSCAR","templateName":"POSCAR_NEB_FINAL"}],"results":["total_energy","total_energy_contributions","pressure","fermi_energy","atomic_forces","total_force","stress_tensor"],"monitors":["standard_output","convergence_electronic"],"applicationName":"vasp","executableName":"vasp"}}}}}}