diff --git a/applications/application_data.yml b/applications/application_data.yml index b43c59bf..9dbed3f0 100644 --- a/applications/application_data.yml +++ b/applications/application_data.yml @@ -1,3 +1,4 @@ +deepmd: !include 'applications/deepmd.yml' espresso: !include 'applications/espresso.yml' exabyteml: !include 'applications/exabyteml.yml' jupyterLab: !include 'applications/jupyterLab.yml' diff --git a/applications/deepmd.yml b/applications/deepmd.yml new file mode 100644 index 00000000..d6b058c8 --- /dev/null +++ b/applications/deepmd.yml @@ -0,0 +1,7 @@ +name: deepmd +shortName: deepmd +summary: DeePMD +defaultVersion: '2.0.2' +versions: + - version: '2.0.2' + isDefault: true diff --git a/assets/deepmd/dp_train_se_e2_r.j2.json b/assets/deepmd/dp_train_se_e2_r.j2.json new file mode 100644 index 00000000..884b21f2 --- /dev/null +++ b/assets/deepmd/dp_train_se_e2_r.j2.json @@ -0,0 +1,67 @@ +{ + "model": { + "type_map": [ + "O", + "H" + ], + "descriptor": { + "type": "se_e2_r", + "sel": [ + 23, + 46 + ], + "rcut_smth": 0.50, + "rcut": 5.00, + "neuron": [ + 5, + 5, + 5 + ], + "resnet_dt": false, + "seed": 1 + }, + "fitting_net": { + "neuron": [ + 60, + 60, + 60 + ], + "resnet_dt": true, + "seed": 1 + } + }, + "learning_rate": { + "type": "exp", + "decay_steps": 1000, + "start_lr": 0.005, + "stop_lr": 3.51e-6 + }, + "loss": { + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + }, + "training": { + "training_data": { + "systems": [ + "./training/" + ], + "batch_size": "auto" + }, + "validation_data": { + "systems": [ + "./validation/" + ], + "batch_size": "auto" + }, + "numb_steps": 301, + "seed": 1, + "disp_file": "lcurve.out", + "disp_freq": 10, + "numb_test": 1, + "save_freq": 100 + } +} diff --git a/assets/deepmd/espresso_cp_to_deepmd.j2.py b/assets/deepmd/espresso_cp_to_deepmd.j2.py new file mode 100644 index 00000000..d092102b --- /dev/null +++ b/assets/deepmd/espresso_cp_to_deepmd.j2.py @@ -0,0 +1,23 @@ +import dpdata +import numpy as np + +# https://docs.deepmodeling.com/projects/dpdata/en/master/formats/QECPTrajFormat.html +data = dpdata.LabeledSystem("cp", fmt="qe/cp/traj") +print("Dataset contains total {0} frames".format(len(data))) + +# randomly choose 20% index for validation_data +size = len(data) +size_validation = round(size * 0.2) + +index_validation = np.random.choice(size, size=size_validation, replace=False) +index_training = list(set(range(size)) - set(index_validation)) + +data_training = data.sub_system(index_training) +data_validation = data.sub_system(index_validation) + +print("Using {0} frames as training set".format(len(data_training))) +print("Using {0} frames as validation set".format(len(data_validation))) + +# save training and validation sets +data_training.to_deepmd_npy("./training") +data_validation.to_deepmd_npy("./validation") diff --git a/assets/deepmd/espresso_to_lammps_structure.j2.py b/assets/deepmd/espresso_to_lammps_structure.j2.py new file mode 100644 index 00000000..98841825 --- /dev/null +++ b/assets/deepmd/espresso_to_lammps_structure.j2.py @@ -0,0 +1,10 @@ +import dpdata + +# load cp data files +# https://docs.deepmodeling.com/projects/dpdata/en/master/formats/QECPTrajFormat.html +system = dpdata.LabeledSystem("cp", fmt="qe/cp/traj") + +# convert dpdata to lammps format +# below procedure will convert input QE structure to lammps format, user may +# want to generate supercell or other complex structure for lammps calculation +system.to_lmp("system.lmp") diff --git a/assets/deepmd/in.lammps b/assets/deepmd/in.lammps new file mode 100644 index 00000000..c51c63df --- /dev/null +++ b/assets/deepmd/in.lammps @@ -0,0 +1,25 @@ +# LAMMPS input, deepmd-kit version. Built from bulk water calculation. + +units metal +boundary p p p +atom_style atomic + +neighbor 2.0 bin +neigh_modify every 10 delay 0 check no + +read_data system.lmp +mass 1 16 +mass 2 2 + +pair_style deepmd ./graph.pb +pair_coeff * * + +velocity all create 330.0 23456789 + +fix 1 all nvt temp 330.0 330.0 0.5 +timestep 0.0005 +thermo_style custom step pe ke etotal temp press vol +thermo 100 +dump 1 all custom 100 system.dump id type x y z + +run 100 diff --git a/assets/espresso/cp.j2.in b/assets/espresso/cp.j2.in index 0068d9be..9cb1b5f5 100644 --- a/assets/espresso/cp.j2.in +++ b/assets/espresso/cp.j2.in @@ -27,6 +27,9 @@ electron_dynamics = 'verlet' electron_velocities = 'zero' emass = {{ dynamics.electronMass }} +!! consider the below parameters if orthogonalization fails +! orthogonalization = 'ortho' +! ortho_eps = 1d-11 / &IONS ion_dynamics = 'verlet' diff --git a/executables/deepmd/dp.yml b/executables/deepmd/dp.yml new file mode 100644 index 00000000..ae20caa8 --- /dev/null +++ b/executables/deepmd/dp.yml @@ -0,0 +1,13 @@ +isDefault: false +monitors: + - standard_output +results: [] +flavors: + dp_train_se_e2_r: + input: + - name: dp_train_se_e2_r.json + results: [] + monitors: + - standard_output + applicationName: deepmd + executableName: dp # dp train dp_train_se_e2_r.json && dp freeze -o graph.pb diff --git a/executables/deepmd/lmp.yml b/executables/deepmd/lmp.yml new file mode 100644 index 00000000..cd6c5eab --- /dev/null +++ b/executables/deepmd/lmp.yml @@ -0,0 +1,13 @@ +isDefault: false +monitors: + - standard_output +results: [] +flavors: + lammps_md: + input: + - name: in.lammps + results: [] + monitors: + - standard_output + applicationName: deepmd + executableName: lmp # mpirun -np $NP lmp -in in.lammps diff --git a/executables/deepmd/python.yml b/executables/deepmd/python.yml new file mode 100644 index 00000000..5f0aef0b --- /dev/null +++ b/executables/deepmd/python.yml @@ -0,0 +1,24 @@ +isDefault: true +monitors: + - standard_output +results: [] +flavors: + espresso_cp_to_deepmd: + isDefault: true + input: + - name: espresso_cp_to_deepmd.py + results: [] + monitors: + - standard_output + applicationName: deepmd + executableName: python + + espresso_to_lammps_structure: + isDefault: false + input: + - name: espresso_to_lammps_structure.py + results: [] + monitors: + - standard_output + applicationName: deepmd + executableName: python diff --git a/executables/espresso/cp.x.yml b/executables/espresso/cp.x.yml index a9a2017c..c600efc8 100644 --- a/executables/espresso/cp.x.yml +++ b/executables/espresso/cp.x.yml @@ -3,6 +3,7 @@ monitors: results: [] flavors: cp: + isDefault: true input: - name: cp.in results: [] diff --git a/executables/tree.yml b/executables/tree.yml index ffad2b0b..a587f5ac 100644 --- a/executables/tree.yml +++ b/executables/tree.yml @@ -1,3 +1,7 @@ +deepmd: + dp: !include 'executables/deepmd/dp.yml' + lmp: !include 'executables/deepmd/lmp.yml' + python: !include 'executables/deepmd/python.yml' espresso: average.x: !include 'executables/espresso/average.x.yml' bands.x: !include 'executables/espresso/bands.x.yml' diff --git a/src/js/data/application_data.js b/src/js/data/application_data.js index e41042a0..c44a7d09 100644 --- a/src/js/data/application_data.js +++ b/src/js/data/application_data.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {applicationData: {"espresso":{"name":"espresso","shortName":"qe","summary":"Quantum Espresso","defaultVersion":"6.3","versions":[{"version":"5.2.1","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"5.2.1","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"5.2.1","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"5.4.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"5.4.0","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"5.4.0","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"6.0.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.0.0","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"6.0.0","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"6.3","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.3","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"6.3","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"6.4.1","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.5.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.6.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.7.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.8.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"7.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"7.0","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"7.0","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"7.2","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"7.2","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"7.2","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true}]},"exabyteml":{"name":"exabyteml","shortName":"ml","summary":"Exabyte Machine Learning Engine","defaultVersion":"0.2.0","versions":[{"version":"0.2.0","isDefault":true}]},"jupyterLab":{"name":"jupyterLab","shortName":"jl","summary":"JupyterLab","defaultVersion":"3.0.3","versions":[{"version":"3.0.3","isDefault":true},{"version":"3.0.3","isDefault":false,"build":"with-pre-installed-packages"}]},"nwchem":{"name":"nwchem","shortName":"nwchem","summary":"NWChem","defaultVersion":"7.0.2","versions":[{"version":"7.0.2","isDefault":true},{"version":"6.6"}]},"python":{"name":"python","shortName":"py","summary":"Python Script","defaultVersion":"3.8.6","versions":[{"version":"3.6.12"},{"version":"3.7.9"},{"version":"3.8.6","isDefault":true},{"version":"3.9.1"},{"version":"anaconda3-5.2.0"}]},"shell":{"name":"shell","shortName":"sh","summary":"Shell Script","defaultVersion":"4.2.46","versions":[{"version":"4.2.46","isDefault":true}]},"vasp":{"name":"vasp","shortName":"vasp","summary":"Vienna Ab-initio Simulation Package","defaultVersion":"5.3.5","isLicensed":true,"versions":[{"version":"5.3.5","isDefault":true},{"version":"5.3.5","isDefault":false,"build":"Non-collinear"},{"version":"5.3.5","isDefault":false,"build":"VTST"},{"version":"5.4.4","isDefault":true},{"version":"5.4.4","isDefault":false,"build":"Gamma"},{"version":"5.4.4","isDefault":false,"build":"Non-collinear"},{"version":"5.4.4","isDefault":false,"build":"VTST"},{"version":"5.4.4","isDefault":false,"build":"VTST-Gamma"},{"version":"5.4.4","isDefault":false,"build":"VTST-Non-collinear"}]}}} +module.exports = {applicationData: {"deepmd":{"name":"deepmd","shortName":"deepmd","summary":"DeePMD","defaultVersion":"2.0.2","versions":[{"version":"2.0.2","isDefault":true}]},"espresso":{"name":"espresso","shortName":"qe","summary":"Quantum Espresso","defaultVersion":"6.3","versions":[{"version":"5.2.1","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"5.2.1","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"5.2.1","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"5.4.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"5.4.0","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"5.4.0","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"6.0.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.0.0","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"6.0.0","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"6.3","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.3","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"6.3","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"6.4.1","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.5.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.6.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.7.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"6.8.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"7.0","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"7.0","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true},{"version":"7.0","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"7.2","isDefault":true,"hasAdvancedComputeOptions":true},{"version":"7.2","isDefault":false,"build":"Intel","hasAdvancedComputeOptions":true},{"version":"7.2","isDefault":false,"build":"GNU","hasAdvancedComputeOptions":true}]},"exabyteml":{"name":"exabyteml","shortName":"ml","summary":"Exabyte Machine Learning Engine","defaultVersion":"0.2.0","versions":[{"version":"0.2.0","isDefault":true}]},"jupyterLab":{"name":"jupyterLab","shortName":"jl","summary":"JupyterLab","defaultVersion":"3.0.3","versions":[{"version":"3.0.3","isDefault":true},{"version":"3.0.3","isDefault":false,"build":"with-pre-installed-packages"}]},"nwchem":{"name":"nwchem","shortName":"nwchem","summary":"NWChem","defaultVersion":"7.0.2","versions":[{"version":"7.0.2","isDefault":true},{"version":"6.6"}]},"python":{"name":"python","shortName":"py","summary":"Python Script","defaultVersion":"3.8.6","versions":[{"version":"3.6.12"},{"version":"3.7.9"},{"version":"3.8.6","isDefault":true},{"version":"3.9.1"},{"version":"anaconda3-5.2.0"}]},"shell":{"name":"shell","shortName":"sh","summary":"Shell Script","defaultVersion":"4.2.46","versions":[{"version":"4.2.46","isDefault":true}]},"vasp":{"name":"vasp","shortName":"vasp","summary":"Vienna Ab-initio Simulation Package","defaultVersion":"5.3.5","isLicensed":true,"versions":[{"version":"5.3.5","isDefault":true},{"version":"5.3.5","isDefault":false,"build":"Non-collinear"},{"version":"5.3.5","isDefault":false,"build":"VTST"},{"version":"5.4.4","isDefault":true},{"version":"5.4.4","isDefault":false,"build":"Gamma"},{"version":"5.4.4","isDefault":false,"build":"Non-collinear"},{"version":"5.4.4","isDefault":false,"build":"VTST"},{"version":"5.4.4","isDefault":false,"build":"VTST-Gamma"},{"version":"5.4.4","isDefault":false,"build":"VTST-Non-collinear"}]}}} diff --git a/src/js/data/templates.js b/src/js/data/templates.js index faa262fd..ae735d40 100644 --- a/src/js/data/templates.js +++ b/src/js/data/templates.js @@ -1,2 +1,2 @@ /* eslint-disable */ -module.exports = {allTemplates: [{"content":"1\npp.dat\n1.0\n3000\n3\n3.0000\n","name":"average.in","contextProviders":[],"applicationName":"espresso","executableName":"average.x"},{"content":"&BANDS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n filband = {% raw %}'{{ JOB_WORK_DIR }}/bands.dat'{% endraw %}\n no_overlap = .true.\n/\n\n","name":"bands.in","contextProviders":[],"applicationName":"espresso","executableName":"bands.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'cp'\n restart_mode = '{{ input.RESTART_MODE }}'\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}'{% endraw %}\n prefix = 'cp'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n nstep = {{ dynamics.numberOfSteps }}\n iprint = 1\n dt = {{ dynamics.timeStep }}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n nr1b = 20\n nr2b = 20\n nr3b = 20\n/\n&ELECTRONS\n electron_dynamics = 'verlet'\n electron_velocities = 'zero'\n emass = {{ dynamics.electronMass }}\n/\n&IONS\n ion_dynamics = 'verlet'\n ion_velocities = 'zero'\n tempw = {{ dynamics.temperature }}\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\n","name":"cp.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"IonDynamicsContextProvider"}],"applicationName":"espresso","executableName":"cp.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'cp-wf'\n restart_mode = '{{ input.RESTART_MODE }}'\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}'{% endraw %}\n prefix = 'cp_wf'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n nstep = {{ dynamics.numberOfSteps }}\n iprint = 1\n dt = {{ dynamics.timeStep }}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n! TODO: figure out the runtime error when `ecutrho` is set. \n! In the meantime, using Norm-conserving pseudopotentials works.\n! ecutrho = {{ cutoffs.density }}\n nr1b = 20\n nr2b = 20\n nr3b = 20\n/\n&ELECTRONS\n electron_dynamics = 'verlet'\n electron_velocities = 'zero'\n emass = {{ dynamics.electronMass }}\n/\n&IONS\n ion_dynamics = 'verlet'\n ion_velocities = 'zero'\n tempw = {{ dynamics.temperature }}\n/\n&CELL\n/\n&WANNIER\n nit = 60,\n calwf = 3,\n tolw = 1.D-6,\n nsteps = 50,\n adapt = .FALSE.\n wfdt = 2.0D0,\n wf_q = 500.D0,\n wf_friction = 0.3d0,\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\n","name":"cp_wf.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"IonDynamicsContextProvider"}],"applicationName":"espresso","executableName":"cp.x"},{"content":"&DOS\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n/\n\n","name":"dos.in","contextProviders":[],"applicationName":"espresso","executableName":"dos.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"dynmat_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"dynmat.x"},{"content":"&inputpp\n calculation = \"eps\"\n prefix = \"__prefix__\"\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n\n&energy_grid\n smeartype = \"gauss\"\n intersmear = 0.2\n intrasmear = 0.0\n wmin = 0.0\n wmax = 30.0\n nw = 500\n shift = 0.0\n/\n\n","name":"epsilon.in","contextProviders":[],"applicationName":"espresso","executableName":"epsilon.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! truncation (used for both correlation and exchange)\n truncation = '2d'\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of the Coulomb solver\n thres_coul = 1.0d-2\n\n ! configuration of W in the convolution\n model_coul = 'godby-needs'\n max_freq_coul = 120\n num_freq_coul = 35\n\n ! configuration of the Green solver\n thres_green = 1.0d-3\n max_iter_green = 300\n\n ! configuration for the correlation self energy\n ecut_corr = 5.0\n max_freq_corr = 100.0\n num_freq_corr = 11\n\n ! configuration for the exchange self energy\n ecut_exch = 20.0\n\n ! configuration for the output\n eta = 0.1\n min_freq_wind = -30.0\n max_freq_wind = 30.0\n num_freq_wind = 601\n/\n\n&gw_output\n/\n\nFREQUENCIES\n2\n 0.0 0.0\n 0.0 10.0\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_plasmon_pole.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&gw_input\n\n ! see http://www.sternheimergw.org for more information.\n\n ! config of the scf run\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n\n ! the grid used for the linear response\n kpt_grid = {{ kgrid.dimensions|join(', ') }}\n qpt_grid = {{ qgrid.dimensions|join(', ') }}\n\n ! number of bands for which the GW correction is calculated\n num_band = 8\n\n ! configuration of W in the convolution\n max_freq_coul = 200\n num_freq_coul = 51\n\n ! configuration for the correlation self energy\n ecut_corr = 6.0\n\n ! configuration for the exchange self energy\n ecut_exch = 15.0\n/\n\n&gw_output\n/\n\nFREQUENCIES\n35\n 0.0 0.0\n 0.0 0.3\n 0.0 0.9\n 0.0 1.8\n 0.0 3.0\n 0.0 4.5\n 0.0 6.3\n 0.0 8.4\n 0.0 10.8\n 0.0 13.5\n 0.0 16.5\n 0.0 19.8\n 0.0 23.4\n 0.0 27.3\n 0.0 31.5\n 0.0 36.0\n 0.0 40.8\n 0.0 45.9\n 0.0 51.3\n 0.0 57.0\n 0.0 63.0\n 0.0 69.3\n 0.0 75.9\n 0.0 82.8\n 0.0 90.0\n 0.0 97.5\n 0.0 105.3\n 0.0 113.4\n 0.0 121.8\n 0.0 130.5\n 0.0 139.5\n 0.0 148.8\n 0.0 158.4\n 0.0 168.3\n 0.0 178.5\n/\n\nK_points\n{{ explicitKPath2PIBA.length }}\n{% for point in explicitKPath2PIBA -%}\n{% for coordinate in point.coordinates %}{{ coordinate }}{% endfor %}\n{% endfor %}\n/\n\n","name":"gw_bands_full_frequency.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPath2PIBAFormDataManager"}],"applicationName":"espresso","executableName":"gw.x"},{"content":"&inputhp\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {%- for d in qgrid.dimensions %}\n nq{{ loop.index }} = {{ d }}\n {%- endfor %}\n/\n","name":"hp.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"hp.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc = 'force_constants.fc'\n flfrq = 'frequencies.freq'\n dos = .true.\n fldos = 'phonon_dos.out'\n deltaE = 1.d0\n {% for d in igrid.dimensions -%}\n nk{{loop.index}} = {{d}}\n {% endfor %}\n /\n","name":"matdyn_grid.in","contextProviders":[{"name":"IGridFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"&INPUT\n asr = 'simple'\n flfrc ='force_constants.fc'\n flfrq ='frequencies.freq'\n flvec ='normal_modes.out'\n q_in_band_form = .true.\n /\n{{ipath.length}}\n{% for point in ipath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"matdyn_path.in","contextProviders":[{"name":"IPathFormDataManager"}],"applicationName":"espresso","executableName":"matdyn.x"},{"content":"BEGIN\nBEGIN_PATH_INPUT\n&PATH\n restart_mode = 'from_scratch'\n string_method = 'neb',\n nstep_path = 50,\n ds = 2.D0,\n opt_scheme = \"broyden\",\n num_of_images = {{ 2 + (input.INTERMEDIATE_IMAGES.length || neb.nImages) }},\n k_max = 0.3D0,\n k_min = 0.2D0,\n CI_scheme = \"auto\",\n path_thr = 0.1D0,\n/\nEND_PATH_INPUT\nBEGIN_ENGINE_INPUT\n&CONTROL\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.03\n nspin = 2\n starting_magnetization = 0.5\n/\n&ELECTRONS\n conv_thr = 1.D-8\n mixing_beta = 0.3\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nBEGIN_POSITIONS\nFIRST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.FIRST_IMAGE }}\n{%- for IMAGE in input.INTERMEDIATE_IMAGES %}\nINTERMEDIATE_IMAGE\nATOMIC_POSITIONS crystal\n{{ IMAGE }}\n{%- endfor %}\nLAST_IMAGE\nATOMIC_POSITIONS crystal\n{{ input.LAST_IMAGE }}\nEND_POSITIONS\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nEND_ENGINE_INPUT\nEND\n","name":"neb.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"NEBFormDataManager"},{"name":"QENEBInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"neb.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n ldisp = .true.\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n{% for point in qpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor %}\n{% endfor %}\n","name":"ph_path.in","contextProviders":[{"name":"QPathFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-12\n asr = .true.\n search_sym = .false.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n/\n0 0 0\n","name":"ph_gamma.in","contextProviders":[],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .false.\n start_irr = 0\n last_irr = 0\n ldisp = .true.\n fildyn = 'dyn0'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_init_qpoints.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18,\n recover = .true.\n ldisp = .true.\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n fildyn = 'dyn'\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_grid_restart.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPH\n tr2_ph = 1.0d-18\n ldisp = .true.\n {% raw -%}\n start_q = {{MAP_DATA.qpoint}}\n last_q = {{MAP_DATA.qpoint}}\n start_irr = {{MAP_DATA.irr}}\n last_irr= {{MAP_DATA.irr}}\n {%- endraw %}\n recover = .true.\n fildyn = 'dyn'\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_SCRATCH_DIR }}/outdir'{% endraw %}\n {% for d in qgrid.dimensions -%}\n nq{{loop.index}} = {{d}}\n {% endfor %}\n/\n","name":"ph_single_irr_qpt.in","contextProviders":[{"name":"QGridFormDataManager"}],"applicationName":"espresso","executableName":"ph.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 0\n/\n&PLOT\n iflag = 3\n output_format = 5\n fileout ='density.xsf'\n/\n\n","name":"pp_density.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&INPUTPP\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n filplot = 'pp.dat'\n plot_num = 11\n/\n","name":"pp_electrostatic_potential.in","contextProviders":[],"applicationName":"espresso","executableName":"pp.x"},{"content":"&PROJWFC\n prefix = '__prefix__'\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n degauss = 0.01\n deltaE = 0.05\n/\n","name":"projwfc.in","contextProviders":[],"applicationName":"espresso","executableName":"projwfc.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n input_dft = 'hse',\n {% for d in qgrid.dimensions -%}\n nqx{{loop.index}} = {{d}}\n {% endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal\n{{ '{{' }} {{ explicitKPath.length }} {% raw %} + KPOINTS|length }} {% endraw %}\n{%- raw %}\n{% for point in KPOINTS -%}\n {% for d in point.coordinates %}{{ \"%14.9f\"|format(d) }} {% endfor -%}{{ point.weight }}\n{% endfor %}\n{% endraw -%}\n{% for point in explicitKPath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}0.0000001\n{% endfor %}\n","name":"pw_scf_bands_hse.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"QGridFormDataManager"},{"name":"ExplicitKPathFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n assume_isolated = 'esm'\n esm_bc = '{{ boundaryConditions.type }}'\n fcp_mu = {{ boundaryConditions.targetFermiEnergy }}\n esm_w = {{ boundaryConditions.offset }}\n esm_efield = {{ boundaryConditions.electricField }}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_esm_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"BoundaryConditionsFormDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% raw %}{{PARAMETER | default('1')}} {{PARAMETER | default('1')}} {{PARAMETER | default('1')}} 0 0 0{% endraw %}\n","name":"pw_scf_kpt_conv.in","contextProviders":[{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'nscf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n{%- if subworkflowContext.NO_SYMMETRY_NO_INVERSION %}\n nosym = .true.\n noinv = .true.\n{%- endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_nscf.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'relax'\n nstep = 50\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&CONTROL\n calculation = 'vc-relax'\n title = ''\n verbosity = 'low'\n restart_mode = 'from_scratch'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_vc_relax.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'bands'\n title = ''\n verbosity = 'low'\n restart_mode = '{{input.RESTART_MODE}}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'smearing'\n degauss = 0.005\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS crystal_b\n{{kpath.length}}\n{% for point in kpath -%}\n{% for d in point.coordinates %}{{d}} {% endfor -%}{{point.steps}}\n{% endfor %}\n","name":"pw_bands.in","contextProviders":[{"name":"KPathFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n ecutfock = 100\n occupations = 'smearing'\n degauss = 0.005\n input_dft='hse',\n nqx1 = {% if kgrid.dimensions[0]%2 == 0 %}{{kgrid.dimensions[0]/2}}{% else %}{{(kgrid.dimensions[0]+1)/2}}{% endif %}, nqx2 = {% if kgrid.dimensions[1]%2 == 0 %}{{kgrid.dimensions[1]/2}}{% else %}{{(kgrid.dimensions[1]+1)/2}}{% endif %}, nqx3 = {% if kgrid.dimensions[2]%2 == 0 %}{{kgrid.dimensions[2]/2}}{% else %}{{(kgrid.dimensions[2]+1)/2}}{% endif %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{% if d%2 == 0 %}{{d}} {% else %}{{d+1}} {% endif %}{% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_hse.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_u -%}\nU {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }}\n{% endfor -%}\n","name":"pw_scf_dft_u.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardUContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_u -%}\nU {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.hubbardUValue }}\n{% endfor -%}\n{% for row in hubbard_v -%}\nV {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.atomicSpecies2 }}-{{ row.atomicOrbital2 }} {{ row.siteIndex }} {{ row.siteIndex2 }} {{ row.hubbardVValue }}\n{% endfor -%}\n","name":"pw_scf_dft_v.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardUContextManager"},{"name":"HubbardVContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\nHUBBARD {ortho-atomic}\n{% for row in hubbard_j -%}\n{{ row.paramType }} {{ row.atomicSpecies }}-{{ row.atomicOrbital }} {{ row.value }}\n{% endfor -%}\n","name":"pw_scf_dft_j.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardJContextManager"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"{% if subworkflowContext.MATERIAL_INDEX %}\n{%- set input = input.perMaterial[subworkflowContext.MATERIAL_INDEX] -%}\n{% endif -%}\n&CONTROL\n calculation = 'scf'\n title = ''\n verbosity = 'low'\n restart_mode = '{{ input.RESTART_MODE }}'\n wf_collect = .true.\n tstress = .true.\n tprnfor = .true.\n outdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n wfcdir = {% raw %}'{{ JOB_WORK_DIR }}/outdir'{% endraw %}\n prefix = '__prefix__'\n pseudo_dir = {% raw %}'{{ JOB_WORK_DIR }}/pseudo'{% endraw %}\n/\n&SYSTEM\n ibrav = {{ input.IBRAV }}\n nat = {{ input.NAT }}\n ntyp = {{ input.NTYP }}\n ecutwfc = {{ cutoffs.wavefunction }}\n ecutrho = {{ cutoffs.density }}\n occupations = 'fixed'\n lda_plus_u = .true.\n lda_plus_u_kind = 0\n U_projection_type = 'ortho-atomic'\n {%- for row in hubbard_legacy %}\n Hubbard_U({{ row.atomicSpeciesIndex }}) = {{ row.hubbardUValue }}\n {%- endfor %}\n/\n&ELECTRONS\n diagonalization = 'david'\n diago_david_ndim = 4\n diago_full_acc = .true.\n mixing_beta = 0.3\n startingwfc = 'atomic+random'\n/\n&IONS\n/\n&CELL\n/\nATOMIC_SPECIES\n{{ input.ATOMIC_SPECIES }}\nATOMIC_POSITIONS crystal\n{{ input.ATOMIC_POSITIONS }}\nCELL_PARAMETERS angstrom\n{{ input.CELL_PARAMETERS }}\nK_POINTS automatic\n{% for d in kgrid.dimensions %}{{d}} {% endfor %}{% for s in kgrid.shifts %}{{s}} {% endfor %}\n","name":"pw_scf_dft_u_legacy.in","contextProviders":[{"name":"KGridFormDataManager"},{"name":"QEPWXInputDataManager"},{"name":"PlanewaveCutoffDataManager"},{"name":"HubbardContextManagerLegacy"}],"applicationName":"espresso","executableName":"pw.x"},{"content":"&INPUT\n fildyn = 'dyn'\n zasr = 'simple'\n flfrc = 'force_constants.fc'\n/\n","name":"q2r.in","contextProviders":[],"applicationName":"espresso","executableName":"q2r.x"},{"content":"# ------------------------------------------------------------------------------- #\n# #\n# Example JupyterLab requirements #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. A Python virtual environment is created #\n# - in /scratch/$USER/$JOB_ID (for build: 'Default') #\n# - in /export/share/python/ (for build: 'with-pre-installed-packages') #\n# 3. This list is used to populate a Python virtual environment #\n# 4. JupyterLab is started #\n# #\n# For more information visit: #\n# - https://jupyterlab.readthedocs.io/en/stable/index.html #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Note: With the JupyterLab build 'with-pre-installed-packages', packages #\n# cannot be added during the notebook runtime. #\n# #\n# ------------------------------------------------------------------------------- #\n\njupyterlab==3.0.3\nnotebook>=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). #\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). #\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. #\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). #\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. #\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. #\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. #\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR <=6.2.0\nexabyte-api-client>=2020.10.19\nnumpy>=1.17.3\npandas>=1.1.4\nurllib3<2\n","name":"requirements.txt","contextProviders":[],"applicationName":"jupyterLab","executableName":"jupyter"},{"content":" start nwchem\n title \"Test\"\n charge {{ input.CHARGE }}\n geometry units au noautosym\n {{ input.ATOMIC_POSITIONS }}\n end\n basis\n * library {{ input.BASIS }}\n end\n dft\n xc {{ input.FUNCTIONAL }}\n mult {{ input.MULT }}\n end\n task dft energy\n","name":"nwchem_total_energy.inp","contextProviders":[{"name":"NWChemInputDataManager"}],"applicationName":"nwchem","executableName":"nwchem"},{"content":"# ---------------------------------------------------------------- #\n# #\n# Example python script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. runtime directory for this calculation is created #\n# 2. requirements.txt is used to create a virtual environment #\n# 3. virtual environment is activated #\n# 4. python process running this script is started #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\nimport pymatgen as mg\n\nsi = mg.Element(\"Si\")\n\nprint(si.atomic_mass)\n","name":"hello_world.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2\n","name":"requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------------ #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# Please add any packages required for this unit below following #\n# the requirements.txt specification: #\n# https://pip.pypa.io/en/stable/reference/requirements-file-format/ #\n# ------------------------------------------------------------------ #\n","name":"requirements_empty.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# -------------------------------------------------------------------------------\n# This script contains a few helpful commands for basic plotting with matplotlib.\n# The commented out blocks are optional suggestions and included for convenience.\n# -------------------------------------------------------------------------------\nimport matplotlib.pyplot as plt\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# Plot Settings\n# -------------\nfigure_size = (6.4, 4.8) # width, height [inches]\ndpi = 100 # resolution [dots-per-inch]\nfont_size_title = 16 # font size of title\nfont_size_axis = 12 # font size of axis label\nfont_size_tick = 12 # font size of tick label\nfont_size_legend = 14 # font size of legend\nx_axis_label = None # label for x-axis\ny_axis_label = None # label for y-axis\ntitle = None # figure title\nshow_legend = False # whether to show legend\nsave_name = \"plot.pdf\" # output filename (with suffix), e.g. 'plot.pdf'\nx_view_limits = {\"left\": None, \"right\": None} # view limits for x-axis\ny_view_limits = {\"top\": None, \"bottom\": None} # view limits for y-axis\nx_tick_spacing = None # custom tick spacing for x-axis (optional)\ny_tick_spacing = None # custom tick spacing for y-axis (optional)\nx_tick_labels = None # custom tick labels for x-axis (optional)\ny_tick_labels = None # custom tick labels for y-axis (optional)\n\n\n# Figure & axes objects\n# ---------------------\nfig = plt.figure(figsize=figure_size, dpi=dpi)\nax = fig.add_subplot(111)\n\n# Example plot (REPLACE ACCORDINGLY)\n# ------------\nx = np.linspace(0, 7, num=100)\ny = np.sin(x)\nax.plot(x, y, \"g-\", zorder=3)\n\n\n# Help lines\n# ----------\n# ax.axhline(y=0, color=\"0.25\", linewidth=0.6, zorder=1)\n# ax.axvline(x=0, color=\"0.25\", linewidth=0.6, zorder=1)\n\n\n# View limits\n# -----------\nax.set_xlim(**x_view_limits)\nax.set_ylim(**y_view_limits)\n\n\n# Grid lines\n# ----------\n# grid_style = {\n# \"linestyle\" : \"dotted\",\n# \"linewidth\" : 0.6,\n# \"color\" : \"0.25\",\n# }\n# ax.grid(**grid_style)\n\n# Custom tick spacing\n# -------------------\n# ax.xaxis.set_major_locator(ticker.MultipleLocator(x_tick_spacing))\n# ax.yaxis.set_major_locator(ticker.MultipleLocator(y_tick_spacing))\n\n# Custom tick labels\n# ------------------\nif x_tick_labels is not None:\n ax.set_xticklabels(x_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\nif y_tick_labels is not None:\n ax.set_yticklabels(y_tick_labels, fontdict={\"fontsize\": font_size_tick}, minor=False)\n\n# Other tick settings\n# -------------------\n# ax.tick_params(axis=\"both\", which=\"major\", labelsize=font_size_tick, direction=\"in\")\n# ax.tick_params(axis=\"x\", which=\"major\", pad=10)\n# ax.tick_params(axis=\"x\", which=\"minor\", bottom=False, top=False)\n\n\n# Axis labels\n# -----------\nif x_axis_label is not None:\n ax.set_xlabel(x_axis_label, size=font_size_axis)\nif y_axis_label is not None:\n ax.set_ylabel(y_axis_label, size=font_size_axis)\n\n# Figure title\n# ------------\nif title is not None:\n ax.set_title(title, fontsize=font_size_title)\n\n# Legend\n# ------\nif show_legend:\n ax.legend(prop={'size': font_size_legend})\n\n# Save figure\n# -----------\nif save_name is not None:\n save_format = save_name.split(\".\")[-1]\n fig.savefig(save_name, format=save_format, bbox_inches=\"tight\")\n","name":"matplotlib_basic.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ---------------------------------------------------------- #\n# #\n# This script extracts q-points and irreducible #\n# representations from Quantum ESPRESSO xml data. #\n# #\n# Expects control_ph.xml and patterns.?.xml files to exist #\n# #\n# ---------------------------------------------------------- #\nfrom __future__ import print_function\nimport json\nfrom xml.dom import minidom\n\n{# JOB_WORK_DIR will be initialized at runtime => avoid substituion below #}\n{%- raw -%}\nCONTROL_PH_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/control_ph.xml\"\nPATTERNS_FILENAME = \"{{JOB_WORK_DIR}}/outdir/_ph0/__prefix__.phsave/patterns.{}.xml\"\n{%- endraw -%}\n\n# get integer content of an xml tag in a document\ndef get_int_by_tag_name(doc, tag_name):\n element = doc.getElementsByTagName(tag_name)\n return int(element[0].firstChild.nodeValue)\n\nvalues = []\n\n# get number of q-points and cycle through them\nxmldoc = minidom.parse(CONTROL_PH_FILENAME)\nnumber_of_qpoints = get_int_by_tag_name(xmldoc, \"NUMBER_OF_Q_POINTS\")\n\nfor i in range(number_of_qpoints):\n # get number of irreducible representations per qpoint\n xmldoc = minidom.parse(PATTERNS_FILENAME.format(i+1))\n number_of_irr_per_qpoint = get_int_by_tag_name(xmldoc, \"NUMBER_IRR_REP\")\n # add each distinct combination of qpoint and irr as a separate entry\n for j in range(number_of_irr_per_qpoint):\n values.append({\n \"qpoint\": i + 1,\n \"irr\": j + 1\n })\n\n# store final values in standard output (STDOUT)\nprint(json.dumps(values, indent=4))\n","name":"espresso_xml_get_qpt_irr.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"import re\nimport json\n\ndouble_regex = r'[-+]?\\d*\\.\\d+(?:[eE][-+]?\\d+)?'\nregex = r\"\\s+k\\(\\s+\\d*\\)\\s+=\\s+\\(\\s+({0})\\s+({0})\\s+({0})\\),\\s+wk\\s+=\\s+({0}).+?\\n\".format(double_regex)\n\nwith open(\"pw_scf.out\") as f:\n text = f.read()\n\npattern = re.compile(regex, re.I | re.MULTILINE)\nmatch = pattern.findall(text[text.rfind(\" cryst. coord.\"):])\nkpoints = [{\"coordinates\": list(map(float, m[:3])), \"weight\": float(m[3])} for m in match]\nprint(json.dumps({\"name\": \"KPOINTS\", \"value\": kpoints, \"scope\": \"global\"}, indent=4))\n","name":"espresso_extract_kpoints.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------- #\n# This script aims to determine extrema for a given array. #\n# Please adjust the parameters according to your data. #\n# Note: This template expects the array to be defined in the #\n# context as 'array_from_context' (see details below). #\n# ----------------------------------------------------------- #\nimport numpy as np\nfrom scipy.signal import find_peaks\nimport json\nfrom munch import Munch\n\n# Data From Context\n# -----------------\n# The array 'array_from_context' is a 1D list (float or int) that has to be defined in\n# a preceding assignment unit in order to be extracted from the context.\n# Example: [0.0, 1.0, 4.0, 3.0]\n# Upon rendering the following Jinja template the extracted array will be inserted.\n{% raw %}Y = np.array({{array_from_context}}){% endraw %}\n\n# Settings\n# --------\nprominence = 0.3 # required prominence in the unit of the data array\n\n# Find Extrema\n# ------------\nmax_indices, _ = find_peaks(Y, prominence=prominence)\nmin_indices, _ = find_peaks(-1 * Y, prominence=prominence)\n\nresult = {\n \"maxima\": Y[max_indices].tolist(),\n \"minima\": Y[min_indices].tolist(),\n}\n\n# print final values to standard output (STDOUT),\n# so that they can be read by a subsequent assignment unit (using value=STDOUT)\nprint(json.dumps(result, indent=4))\n","name":"find_extrema.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Example Python package requirements for the Mat3ra platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# ----------------------------------------------------------------- #\n\n\nmunch==2.5.0\nnumpy>=1.19.5\nscipy>=1.5.4\nmatplotlib>=3.0.0\n","name":"processing_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# PythonML Package Requirements for use on the Exabyte.io Platform #\n# #\n# Will be used as follows: #\n# #\n# 1. A runtime directory for this calculation is created #\n# 2. This list is used to populate a Python virtual environment #\n# 3. The virtual environment is activated #\n# 4. The Python process running the script included within this #\n# job is started #\n# #\n# For more information visit: #\n# - https://pip.pypa.io/en/stable/reference/pip_install #\n# - https://virtualenv.pypa.io/en/stable/ #\n# #\n# The package set below is a stable working set of pymatgen and #\n# all of its dependencies. Please adjust the list to include #\n# your preferred packages. #\n# #\n# ----------------------------------------------------------------- #\n\n# Python 3 packages\ncertifi==2020.12.5\nchardet==4.0.0\ncycler==0.10.0\ndecorator==4.4.2\nfuture==0.18.2\nidna==2.10\nkiwisolver==1.3.1\nmatplotlib==3.3.4\nmonty==4.0.2\nmpmath==1.2.1\nnetworkx==2.5\nnumpy==1.19.5\npalettable==3.3.0\npandas==1.1.5\nPillow==8.1.0\nplotly==4.14.3\npymatgen==2021.2.8.1\npyparsing==2.4.7\npython-dateutil==2.8.1\npytz==2021.1\nrequests==2.25.1\nretrying==1.3.3\nruamel.yaml==0.16.12\nruamel.yaml.clib==0.2.2\nscikit-learn==0.24.1\nscipy==1.5.4\nsix==1.15.0\nspglib==1.16.1\nsympy==1.7.1\ntabulate==0.8.7\nuncertainties==3.1.5\nurllib3==1.26.3\nxgboost==1.4.2;python_version>=\"3.6\"\n","name":"pyml_requirements.txt","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# General settings for PythonML jobs on the Exabyte.io Platform #\n# #\n# This file generally shouldn't be modified directly by users. #\n# The \"datafile\" and \"is_workflow_running_to_predict\" variables #\n# are defined in the head subworkflow, and are templated into #\n# this file. This helps facilitate the workflow's behavior #\n# differing whether it is in a \"train\" or \"predict\" mode. #\n# #\n# Also in this file is the \"Context\" object, which helps maintain #\n# certain Python objects between workflow units, and between #\n# predict runs. #\n# #\n# Whenever a python object needs to be stored for subsequent runs #\n# (such as in the case of a trained model), context.save() can be #\n# called to save it. The object can then be loaded again by using #\n# context.load(). #\n# ----------------------------------------------------------------- #\n\n\nimport pickle, os\n\n# ==================================================\n# Variables modified in the Important Settings menu\n# ==================================================\n# Variables in this section can (and oftentimes need to) be modified by the user in the \"Important Settings\" tab\n# of a workflow.\n\n# Target_column_name is used during training to identify the variable the model is traing to predict.\n# For example, consider a CSV containing three columns, \"Y\", \"X1\", and \"X2\". If the goal is to train a model\n# that will predict the value of \"Y,\" then target_column_name would be set to \"Y\"\ntarget_column_name = \"{{ mlSettings.target_column_name }}\"\n\n# The type of ML problem being performed. Can be either \"regression\", \"classification,\" or \"clustering.\"\nproblem_category = \"{{ mlSettings.problem_category }}\"\n\n# =============================\n# Non user-modifiable variables\n# =============================\n# Variables in this section generally do not need to be modified.\n\n# The problem category, regression or classification or clustering. In regression, the target (predicted) variable\n# is continues. In classification, it is categorical. In clustering, there is no target - a set of labels is\n# automatically generated.\nis_regression = is_classification = is_clustering = False\nif problem_category.lower() == \"regression\":\n is_regression = True\nelif problem_category.lower() == \"classification\":\n is_classification = True\nelif problem_category.lower() == \"clustering\":\n is_clustering = True\nelse:\n raise ValueError(\n \"Variable 'problem_category' must be either 'regression', 'classification', or 'clustering'. Check settings.py\")\n\n# The variables \"is_workflow_running_to_predict\" and \"is_workflow_running_to_train\" are used to control whether\n# the workflow is in a \"training\" mode or a \"prediction\" mode. The \"IS_WORKFLOW_RUNNING_TO_PREDICT\" variable is set by\n# an assignment unit in the \"Set Up the Job\" subworkflow that executes at the start of the job. It is automatically\n# changed when the predict workflow is generated, so users should not need to modify this variable.\nis_workflow_running_to_predict = {% raw %}{{IS_WORKFLOW_RUNNING_TO_PREDICT}}{% endraw %}\nis_workflow_running_to_train = not is_workflow_running_to_predict\n\n# Sets the datafile variable. The \"datafile\" is the data that will be read in, and will be used by subsequent\n# workflow units for either training or prediction, depending on the workflow mode.\nif is_workflow_running_to_predict:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\nelse:\n datafile = \"{% raw %}{{DATASET_BASENAME}}{% endraw %}\"\n\n# The \"Context\" class allows for data to be saved and loaded between units, and between train and predict runs.\n# Variables which have been saved using the \"Save\" method are written to disk, and the predict workflow is automatically\n# configured to obtain these files when it starts.\n#\n# IMPORTANT NOTE: Do *not* adjust the value of \"context_dir_pathname\" in the Context object. If the value is changed, then\n# files will not be correctly copied into the generated predict workflow. This will cause the predict workflow to be\n# generated in a broken state, and it will not be able to make any predictions.\nclass Context(object):\n \"\"\"\n Saves and loads objects from the disk, useful for preserving data between workflow units\n\n Attributes:\n context_paths (dict): Dictionary of the format {variable_name: path}, that governs where\n pickle saves files.\n\n Methods:\n save: Used to save objects to the context directory\n load: Used to load objects from the context directory\n \"\"\"\n\n def __init__(self, context_file_basename=\"workflow_context_file_mapping\"):\n \"\"\"\n Constructor for Context objects\n\n Args:\n context_file_basename (str): Name of the file to store context paths in\n \"\"\"\n\n # Warning: DO NOT modify the context_dir_pathname variable below\n # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n context_dir_pathname = \"{% raw %}{{ CONTEXT_DIR_RELATIVE_PATH }}{% endraw %}\"\n # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n self._context_dir_pathname = context_dir_pathname\n self._context_file = os.path.join(context_dir_pathname, context_file_basename)\n\n # Make context dir if it does not exist\n if not os.path.exists(context_dir_pathname):\n os.makedirs(context_dir_pathname)\n\n # Read in the context sources dictionary, if it exists\n if os.path.exists(self._context_file):\n with open(self._context_file, \"rb\") as file_handle:\n self.context_paths: dict = pickle.load(file_handle)\n else:\n # Items is a dictionary of {varname: path}\n self.context_paths = {}\n\n def __enter__(self):\n return self\n\n def __exit__(self, exc_type, exc_value, traceback):\n self._update_context()\n\n def __contains__(self, item):\n return item in self.context_paths\n\n def _update_context(self):\n with open(self._context_file, \"wb\") as file_handle:\n pickle.dump(self.context_paths, file_handle)\n\n def load(self, name: str):\n \"\"\"\n Returns a contextd object\n\n Args:\n name (str): The name in self.context_paths of the object\n \"\"\"\n path = self.context_paths[name]\n with open(path, \"rb\") as file_handle:\n obj = pickle.load(file_handle)\n return obj\n\n def save(self, obj: object, name: str):\n \"\"\"\n Saves an object to disk using pickle\n\n Args:\n name (str): Friendly name for the object, used for lookup in load() method\n obj (object): Object to store on disk\n \"\"\"\n path = os.path.join(self._context_dir_pathname, f\"{name}.pkl\")\n self.context_paths[name] = path\n with open(path, \"wb\") as file_handle:\n pickle.dump(obj, file_handle)\n self._update_context()\n\n# Generate a context object, so that the \"with settings.context\" can be used by other units in this workflow.\ncontext = Context()\n\nis_using_train_test_split = \"is_using_train_test_split\" in context and (context.load(\"is_using_train_test_split\"))\n\n# Create a Class for a DummyScaler()\nclass DummyScaler:\n \"\"\"\n This class is a 'DummyScaler' which trivially acts on data by returning it unchanged.\n \"\"\"\n\n def fit(self, X):\n return self\n\n def transform(self, X):\n return X\n\n def fit_transform(self, X):\n return X\n\n def inverse_transform(self, X):\n return X\n\nif 'target_scaler' not in context:\n context.save(DummyScaler(), 'target_scaler')\n","name":"pyml_settings.py","contextProviders":[{"name":"MLSettingsDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Custom workflow unit template for the Exabyte.io platform #\n# #\n# This file imports a set of workflow-specific context variables #\n# from settings.py. It then uses a context manager to save and #\n# load Python objects. When saved, these objects can then be #\n# loaded either later in the same workflow, or by subsequent #\n# predict jobs. #\n# #\n# Any pickle-able Python object can be saved using #\n# settings.context. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport settings\n\n# The context manager exists to facilitate\n# saving and loading objects across Python units within a workflow.\n\n# To load an object, simply do to \\`context.load(\"name-of-the-saved-object\")\\`\n# To save an object, simply do \\`context.save(\"name-for-the-object\", object_here)\\`\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Do some transformations to the data here\n\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n descriptors = context.load(\"descriptors\")\n\n # Do some predictions or transformation to the data here\n","name":"pyml_custom.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to read in data for the ML workflow. #\n# #\n# Also showcased here is the concept of branching based on #\n# whether the workflow is in \"train\" or \"predict\" mode. #\n# #\n# If the workflow is in \"training\" mode, it will read in the data #\n# before converting it to a Numpy array and save it for use #\n# later. During training, we already have values for the output, #\n# and this gets saved to \"target.\" #\n# #\n# Finally, whether the workflow is in training or predict mode, #\n# it will always read in a set of descriptors from a datafile #\n# defined in settings.py #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport sklearn.preprocessing\nimport settings\n\nwith settings.context as context:\n data = pandas.read_csv(settings.datafile)\n\n # Train\n # By default, we don't do train/test splitting: the train and test represent the same dataset at first.\n # Other units (such as a train/test splitter) down the line can adjust this as-needed.\n if settings.is_workflow_running_to_train:\n\n # Handle the case where we are clustering\n if settings.is_clustering:\n target = data.to_numpy()[:, 0] # Just get the first column, it's not going to get used anyway\n else:\n target = data.pop(settings.target_column_name).to_numpy()\n\n # Handle the case where we are classifying. In this case, we must convert any labels provided to be categorical.\n # Specifically, labels are encoded with values between 0 and (N_Classes - 1)\n if settings.is_classification:\n label_encoder = sklearn.preprocessing.LabelEncoder()\n target = label_encoder.fit_transform(target)\n context.save(label_encoder, \"label_encoder\")\n\n target = target.reshape(-1, 1) # Reshape array from a row vector into a column vector\n\n context.save(target, \"train_target\")\n context.save(target, \"test_target\")\n\n descriptors = data.to_numpy()\n\n context.save(descriptors, \"train_descriptors\")\n context.save(descriptors, \"test_descriptors\")\n\n else:\n descriptors = data.to_numpy()\n context.save(descriptors, \"descriptors\")\n","name":"data_input_read_csv_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow Unit to perform a train/test split #\n# #\n# Splits the dataset into a training and testing set. The #\n# variable `percent_held_as_test` controls how much of the #\n# input dataset is removed for use as a testing set. By default, #\n# this unit puts 20% of the dataset into the testing set, and #\n# places the remaining 80% into the training set. #\n# #\n# Does nothing in the case of predictions. #\n# #\n# ----------------------------------------------------------------- #\n\nimport sklearn.model_selection\nimport numpy as np\nimport settings\n\n# `percent_held_as_test` is the amount of the dataset held out as the testing set. If it is set to 0.2,\n# then 20% of the dataset is held out as a testing set. The remaining 80% is the training set.\npercent_held_as_test = {{ mlTrainTestSplit.fraction_held_as_test_set }}\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Load training data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n\n # Combine datasets to facilitate train/test split\n\n # Do train/test split\n train_descriptors, test_descriptors, train_target, test_target = sklearn.model_selection.train_test_split(\n train_descriptors, train_target, test_size=percent_held_as_test)\n\n # Set the flag for using a train/test split\n context.save(True, \"is_using_train_test_split\")\n\n # Save training data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"data_input_train_test_split_sklearn.py","contextProviders":[{"name":"MLTrainTestSplitDataManager"}],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn MinMax Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it is on interval #\n# [0,1]. It then saves the data for use further down #\n# the road in the workflow, for use in un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the min and max of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor MinMax Scaler\n scaler = sklearn.preprocessing.MinMaxScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_min_max_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Duplicates workflow unit #\n# #\n# This workflow unit drops all duplicate rows, if it is running #\n# in the \"train\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop duplicates from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n df = df.drop_duplicates()\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop duplicates from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.drop_duplicates()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_duplicates_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Pandas Remove Missing Workflow Unit #\n# #\n# This workflow unit allows missing rows and/or columns to be #\n# dropped from the dataset by configuring the `to_drop` #\n# parameter. #\n# #\n# Valid values for `to_drop`: #\n# - \"rows\": rows with missing values will be removed #\n# - \"columns\": columns with missing values will be removed #\n# - \"both\": rows and columns with missing values will be removed #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport pandas\nimport settings\n\n# `to_drop` can either be \"rows\" or \"columns\"\n# If it is set to \"rows\" (by default), then all rows with missing values will be dropped.\n# If it is set to \"columns\", then all columns with missing values will be dropped.\n# If it is set to \"both\", then all rows and columns with missing values will be dropped.\nto_drop = \"rows\"\n\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Drop missing from the training set\n df = pandas.DataFrame(train_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(train_descriptors))\n\n directions = {\n \"rows\": (\"index\",),\n \"columns\": (\"columns\",),\n \"both\": (\"index\", \"columns\"),\n }[to_drop]\n for direction in directions:\n df = df.dropna(direction)\n\n train_target = df.pop(\"target\").to_numpy()\n train_target = train_target.reshape(-1, 1)\n train_descriptors = df.to_numpy()\n\n # Drop missing from the testing set\n df = pandas.DataFrame(test_target, columns=[\"target\"])\n df = df.join(pandas.DataFrame(test_descriptors))\n df = df.dropna()\n test_target = df.pop(\"target\").to_numpy()\n test_target = test_target.reshape(-1, 1)\n test_descriptors = df.to_numpy()\n\n # Store the data\n context.save(train_target, \"train_target\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_target, \"test_target\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Predict\n else:\n pass\n","name":"pre_processing_remove_missing_pandas.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Sklearn Standard Scaler workflow unit #\n# #\n# This workflow unit scales the data such that it a mean of 0 and #\n# a standard deviation of 1. It then saves the data for use #\n# further down the road in the workflow, for use in #\n# un-transforming the data. #\n# #\n# It is important that new predictions are made by scaling the #\n# new inputs using the mean and variance of the original training #\n# set. As a result, the scaler gets saved in the Training phase. #\n# #\n# During a predict workflow, the scaler is loaded, and the #\n# new examples are scaled using the stored scaler. #\n# ----------------------------------------------------------------- #\n\n\nimport sklearn.preprocessing\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Descriptor Scaler\n scaler = sklearn.preprocessing.StandardScaler\n descriptor_scaler = scaler()\n train_descriptors = descriptor_scaler.fit_transform(train_descriptors)\n test_descriptors = descriptor_scaler.transform(test_descriptors)\n context.save(descriptor_scaler, \"descriptor_scaler\")\n context.save(train_descriptors, \"train_descriptors\")\n context.save(test_descriptors, \"test_descriptors\")\n\n # Our target is only continuous if it's a regression problem\n if settings.is_regression:\n target_scaler = scaler()\n train_target = target_scaler.fit_transform(train_target)\n test_target = target_scaler.transform(test_target)\n context.save(target_scaler, \"target_scaler\")\n context.save(train_target, \"train_target\")\n context.save(test_target, \"test_target\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Get the scaler\n descriptor_scaler = context.load(\"descriptor_scaler\")\n\n # Scale the data\n descriptors = descriptor_scaler.transform(descriptors)\n\n # Store the data\n context.save(descriptors, \"descriptors\")\n","name":"pre_processing_standardization_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge-regression model in Scikit-Learn. #\n# Alpha is taken from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.AdaBoostRegressor(\n n_estimators=50,\n learning_rate=1,\n loss=\"linear\",\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"adaboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"adaboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_adaboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a bagged trees regression model with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.tree\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Base Estimator\n base_estimator = sklearn.tree.DecisionTreeRegressor(\n criterion=\"mse\",\n splitter=\"best\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=None,\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n ccp_alpha=0.0,\n )\n\n # Initialize the Model\n model = sklearn.ensemble.BaggingRegressor(\n n_estimators=10,\n max_samples=1.0,\n max_features=1.0,\n bootstrap=True,\n bootstrap_features=False,\n oob_score=False,\n verbose=0,\n base_estimator=base_estimator,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"bagged_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"bagged_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_bagged_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for gradient-boosted tree regression with #\n# Scikit-Learn. Parameters for the estimator and ensemble are #\n# derived from Scikit-Learn's Defaults. Note: In the gradient- #\n# boosted trees ensemble used, the weak learners used as #\n# estimators cannot be tuned with the same level of fidelity #\n# allowed in the adaptive-boosted trees ensemble. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.GradientBoostingRegressor(\n loss=\"ls\",\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion=\"friedman_mse\",\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n max_features=None,\n alpha=0.9,\n verbose=0,\n max_leaf_nodes=None,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees regression #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). #\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBRegressor(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='reg:squarederror',\n eval_metric='rmse')\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_regression\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_regression\")\n\n # Make some predictions and unscale\n predictions = model.predict(descriptors)\n predictions = predictions.reshape(-1, 1)\n target_scaler = context.load(\"target_scaler\")\n\n predictions = target_scaler.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\")\n","name":"model_extreme_gradboosted_trees_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for k-means clustering. #\n# #\n# In k-means clustering, the labels are not provided ahead of #\n# time. Instead, one supplies the number of groups the #\n# algorithm should split the dataset into. Here, we set our #\n# own default of 4 groups (fewer than sklearn's default of 8). #\n# Otherwise, the default parameters of the clustering method #\n# are the same as in sklearn. #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.cluster\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Initialize the Model\n model = sklearn.cluster.KMeans(\n n_clusters=4,\n init=\"k-means++\",\n n_init=10,\n max_iter=300,\n tol=0.0001,\n copy_x=True,\n algorithm=\"auto\",\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors)\n context.save(model, \"k_means\")\n train_labels = model.predict(train_descriptors)\n test_labels = model.predict(test_descriptors)\n\n context.save(train_labels, \"train_labels\")\n context.save(test_labels, \"test_labels\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"k_means\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_k_means_clustering_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a kernelized ridge-regression model with #\n# Scikit-Learn. Model parameters are derived from Scikit- #\n# Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.kernel_ridge\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.kernel_ridge.KernelRidge(\n alpha=1.0,\n kernel=\"linear\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"kernel_ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"kernel_ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_kernel_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a LASSO-regression model with Scikit- #\n# Learn. Model parameters derived from Scikit-Learn's #\n# Defaults. Alpha has been lowered from the default of 1.0, to #\n# 0.1. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Lasso(\n alpha=0.1,\n fit_intercept=True,\n normalize=False,\n precompute=False,\n tol=0.0001,\n positive=True,\n selection=\"cyclic\",\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"LASSO\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"LASSO\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_lasso_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit to train a simple feedforward neural network #\n# model on a regression problem using scikit-learn. In this #\n# template, we use the default values for hidden_layer_sizes, #\n# activation, solver, and learning rate. Other parameters are #\n# available (consult the sklearn docs), but in this case, we #\n# only include those relevant to the Adam optimizer. Sklearn #\n# Docs: Sklearn docs:http://scikit-learn.org/stable/modules/ge #\n# nerated/sklearn.neural_network.MLPRegressor.html #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.neural_network\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.neural_network.MLPRegressor(\n hidden_layer_sizes=(100,),\n activation=\"relu\",\n solver=\"adam\",\n max_iter=300,\n early_stopping=False,\n validation_fraction=0.1,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"multilayer_perceptron\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"multilayer_perceptron\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_mlp_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a random forest classification model with #\n# Scikit-Learn. Parameters derived from Scikit-Learn's #\n# defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. When #\n# the workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a filee named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestClassifier(\n n_estimators=100,\n criterion=\"gini\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n oob_score=False,\n verbose=0,\n class_weight=None,\n ccp_alpha=0.0,\n max_samples=None,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target, test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for a gradient boosted classification model with #\n# Scikit-Learn. Parameters derived from sklearn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. #\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = sklearn.ensemble.GradientBoostingClassifier(loss='deviance',\n learning_rate=0.1,\n n_estimators=100,\n subsample=1.0,\n criterion='friedman_mse',\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_depth=3,\n min_impurity_decrease=0.0,\n min_impurity_split=None,\n init=None,\n random_state=None,\n max_features=None,\n verbose=0,\n max_leaf_nodes=None,\n warm_start=False,\n validation_fraction=0.1,\n n_iter_no_change=None,\n tol=0.0001,\n ccp_alpha=0.0)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"gradboosted_trees_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"gradboosted_trees_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Workflow unit for eXtreme Gradient-Boosted trees classification #\n# with XGBoost's wrapper to Scikit-Learn. Parameters for the #\n# estimator and ensemble are derived from sklearn defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the confusion matrix. #\n# #\n# When the workflow is run in Predict mode, the model is #\n# loaded, predictions are made, they are un-transformed using #\n# the trained scaler from the training run, and they are #\n# written to a filed named \"predictions.csv\" #\n# ----------------------------------------------------------------- #\n\nimport xgboost\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_target = context.load(\"test_target\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the model\n model = xgboost.XGBClassifier(booster='gbtree',\n verbosity=1,\n learning_rate=0.3,\n min_split_loss=0,\n max_depth=6,\n min_child_weight=1,\n max_delta_step=0,\n colsample_bytree=1,\n reg_lambda=1,\n reg_alpha=0,\n scale_pos_weight=1,\n objective='binary:logistic',\n eval_metric='logloss',\n use_label_encoder=False)\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"extreme_gradboosted_tree_classification\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Save the probabilities of the model\n\n test_probabilities = model.predict_proba(test_descriptors)\n context.save(test_probabilities, \"test_probabilities\")\n\n # Print some information to the screen for the regression problem\n confusion_matrix = sklearn.metrics.confusion_matrix(test_target,\n test_predictions)\n print(\"Confusion Matrix:\")\n print(confusion_matrix)\n context.save(confusion_matrix, \"confusion_matrix\")\n\n # Ensure predictions have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"extreme_gradboosted_tree_classification\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Transform predictions back to their original labels\n label_encoder: sklearn.preprocessing.LabelEncoder = context.load(\"label_encoder\")\n predictions = label_encoder.inverse_transform(predictions)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_extreme_gradboosted_trees_classification_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow for a random forest regression model with Scikit- #\n# Learn. Parameters are derived from Scikit-Learn's defaults. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.ensemble\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.ensemble.RandomForestRegressor(\n n_estimators=100,\n criterion=\"mse\",\n max_depth=None,\n min_samples_split=2,\n min_samples_leaf=1,\n min_weight_fraction_leaf=0.0,\n max_features=\"auto\",\n max_leaf_nodes=None,\n min_impurity_decrease=0.0,\n bootstrap=True,\n max_samples=None,\n oob_score=False,\n ccp_alpha=0.0,\n verbose=0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"random_forest\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"random_forest\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_random_forest_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ------------------------------------------------------------ #\n# Workflow unit for a ridge regression model with Scikit- #\n# Learn. Alpha is taken from Scikit-Learn's default #\n# parameters. #\n# #\n# When then workflow is in Training mode, the model is trained #\n# and then it is saved, along with the RMSE and some #\n# predictions made using the training data (e.g. for use in a #\n# parity plot or calculation of other error metrics). When the #\n# workflow is run in Predict mode, the model is loaded, #\n# predictions are made, they are un-transformed using the #\n# trained scaler from the training run, and they are written #\n# to a file named \"predictions.csv\" #\n# ------------------------------------------------------------ #\n\n\nimport sklearn.linear_model\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n test_target = context.load(\"test_target\")\n train_descriptors = context.load(\"train_descriptors\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Flatten the targets\n train_target = train_target.flatten()\n test_target = test_target.flatten()\n\n # Initialize the Model\n model = sklearn.linear_model.Ridge(\n alpha=1.0,\n )\n\n # Train the model and save\n model.fit(train_descriptors, train_target)\n context.save(model, \"ridge\")\n train_predictions = model.predict(train_descriptors)\n test_predictions = model.predict(test_descriptors)\n\n # Scale predictions so they have the same shape as the saved target\n train_predictions = train_predictions.reshape(-1, 1)\n test_predictions = test_predictions.reshape(-1, 1)\n\n # Scale for RMSE calc on the test set\n target_scaler = context.load(\"target_scaler\")\n\n # Unflatten the target\n test_target = test_target.reshape(-1, 1)\n y_true = target_scaler.inverse_transform(test_target)\n y_pred = target_scaler.inverse_transform(test_predictions)\n\n # RMSE\n mse = sklearn.metrics.mean_squared_error(y_true, y_pred)\n rmse = np.sqrt(mse)\n print(f\"RMSE = {rmse}\")\n context.save(rmse, \"RMSE\")\n\n context.save(train_predictions, \"train_predictions\")\n context.save(test_predictions, \"test_predictions\")\n\n # Predict\n else:\n # Restore data\n descriptors = context.load(\"descriptors\")\n\n # Restore model\n model = context.load(\"ridge\")\n\n # Make some predictions\n predictions = model.predict(descriptors)\n\n # Save the predictions to file\n np.savetxt(\"predictions.csv\", predictions, header=\"prediction\", comments=\"\", fmt=\"%s\")\n","name":"model_ridge_regression_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Parity plot generation unit #\n# #\n# This unit generates a parity plot based on the known values #\n# in the training data, and the predicted values generated #\n# using the training data. #\n# #\n# Because this metric compares predictions versus a ground truth, #\n# it doesn't make sense to generate the plot when a predict #\n# workflow is being run (because in that case, we generally don't #\n# know the ground truth for the values being predicted). Hence, #\n# this unit does nothing if the workflow is in \"predict\" mode. #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\n\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_target = context.load(\"train_target\")\n train_predictions = context.load(\"train_predictions\")\n test_target = context.load(\"test_target\")\n test_predictions = context.load(\"test_predictions\")\n\n # Un-transform the data\n target_scaler = context.load(\"target_scaler\")\n train_target = target_scaler.inverse_transform(train_target)\n train_predictions = target_scaler.inverse_transform(train_predictions)\n test_target = target_scaler.inverse_transform(test_target)\n test_predictions = target_scaler.inverse_transform(test_predictions)\n\n # Plot the data\n plt.scatter(train_target, train_predictions, c=\"#203d78\", label=\"Training Set\")\n if settings.is_using_train_test_split:\n plt.scatter(test_target, test_predictions, c=\"#67ac5b\", label=\"Testing Set\")\n plt.xlabel(\"Actual Value\")\n plt.ylabel(\"Predicted Value\")\n\n # Scale the plot\n target_range = (min(min(train_target), min(test_target)),\n max(max(train_target), max(test_target)))\n predictions_range = (min(min(train_predictions), min(test_predictions)),\n max(max(train_predictions), max(test_predictions)))\n\n limits = (min(min(target_range), min(target_range)),\n max(max(predictions_range), max(predictions_range)))\n plt.xlim = (limits[0], limits[1])\n plt.ylim = (limits[0], limits[1])\n\n # Draw a parity line, as a guide to the eye\n plt.plot((limits[0], limits[1]), (limits[0], limits[1]), c=\"black\", linestyle=\"dotted\", label=\"Parity\")\n plt.legend()\n\n # Save the figure\n plt.tight_layout()\n plt.savefig(\"my_parity_plot.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_parity_plot_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# Cluster Visualization #\n# #\n# This unit takes an N-dimensional feature space, and uses #\n# Principal-component Analysis (PCA) to project into a 2D space #\n# to facilitate plotting on a scatter plot. #\n# #\n# The 2D space we project into are the first two principal #\n# components identified in PCA, which are the two vectors with #\n# the highest variance. #\n# #\n# Wikipedia Article on PCA: #\n# https://en.wikipedia.org/wiki/Principal_component_analysis #\n# #\n# We then plot the labels assigned to the train an test set, #\n# and color by class. #\n# #\n# ----------------------------------------------------------------- #\n\nimport pandas as pd\nimport matplotlib.cm\nimport matplotlib.lines\nimport matplotlib.pyplot as plt\nimport sklearn.decomposition\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n train_labels = context.load(\"train_labels\")\n train_descriptors = context.load(\"train_descriptors\")\n test_labels = context.load(\"test_labels\")\n test_descriptors = context.load(\"test_descriptors\")\n\n # Unscale the descriptors\n descriptor_scaler = context.load(\"descriptor_scaler\")\n train_descriptors = descriptor_scaler.inverse_transform(train_descriptors)\n test_descriptors = descriptor_scaler.inverse_transform(test_descriptors)\n\n # We need at least 2 dimensions, exit if the dataset is 1D\n if train_descriptors.ndim < 2:\n raise ValueError(\"The train descriptors do not have enough dimensions to be plot in 2D\")\n\n # The data could be multidimensional. Let's do some PCA to get things into 2 dimensions.\n pca = sklearn.decomposition.PCA(n_components=2)\n train_descriptors = pca.fit_transform(train_descriptors)\n test_descriptors = pca.transform(test_descriptors)\n xlabel = \"Principle Component 1\"\n ylabel = \"Principle Component 2\"\n\n # Determine the labels we're going to be using, and generate their colors\n labels = set(train_labels)\n colors = {}\n for count, label in enumerate(labels):\n cm = matplotlib.cm.get_cmap('jet', len(labels))\n color = cm(count / len(labels))\n colors[label] = color\n train_colors = [colors[label] for label in train_labels]\n test_colors = [colors[label] for label in test_labels]\n\n # Train / Test Split Visualization\n plt.title(\"Train Test Split Visualization\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=\"#33548c\", marker=\"o\", label=\"Training Set\")\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=\"#F0B332\", marker=\"o\", label=\"Testing Set\")\n xmin, xmax, ymin, ymax = plt.axis()\n plt.legend()\n plt.tight_layout()\n plt.savefig(\"train_test_split.png\", dpi=600)\n plt.close()\n\n def clusters_legend(cluster_colors):\n \"\"\"\n Helper function that creates a legend, given the coloration by clusters.\n Args:\n cluster_colors: A dictionary of the form {cluster_number : color_value}\n\n Returns:\n None; just creates the legend and puts it on the plot\n \"\"\"\n legend_symbols = []\n for group, color in cluster_colors.items():\n label = f\"Cluster {group}\"\n legend_symbols.append(matplotlib.lines.Line2D([], [], color=color, marker=\"o\",\n linewidth=0, label=label))\n plt.legend(handles=legend_symbols)\n\n # Training Set Clusters\n plt.title(\"Training Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(train_descriptors[:, 0], train_descriptors[:, 1], c=train_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"train_clusters.png\", dpi=600)\n plt.close()\n\n # Testing Set Clusters\n plt.title(\"Testing Set Clusters\")\n plt.xlabel(xlabel)\n plt.ylabel(ylabel)\n plt.xlim(xmin, xmax)\n plt.ylim(ymin, ymax)\n plt.scatter(test_descriptors[:, 0], test_descriptors[:, 1], c=test_colors)\n clusters_legend(colors)\n plt.tight_layout()\n plt.savefig(\"test_clusters.png\", dpi=600)\n plt.close()\n\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_pca_2d_clusters_matplotlib.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"# ----------------------------------------------------------------- #\n# #\n# ROC Curve Generator #\n# #\n# Computes and displays the Receiver Operating Characteristic #\n# (ROC) curve. This is restricted to binary classification tasks. #\n# #\n# ----------------------------------------------------------------- #\n\n\nimport matplotlib.pyplot as plt\nimport matplotlib.collections\nimport sklearn.metrics\nimport numpy as np\nimport settings\n\nwith settings.context as context:\n # Train\n if settings.is_workflow_running_to_train:\n # Restore the data\n test_target = context.load(\"test_target\").flatten()\n # Slice the first column because Sklearn's ROC curve prefers probabilities for the positive class\n test_probabilities = context.load(\"test_probabilities\")[:, 1]\n\n # Exit if there's more than one label in the predictions\n if len(set(test_target)) > 2:\n exit()\n\n # ROC curve function in sklearn prefers the positive class\n false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(test_target, test_probabilities,\n pos_label=1)\n thresholds[0] -= 1 # Sklearn arbitrarily adds 1 to the first threshold\n roc_auc = np.round(sklearn.metrics.auc(false_positive_rate, true_positive_rate), 3)\n\n # Plot the curve\n fig, ax = plt.subplots()\n points = np.array([false_positive_rate, true_positive_rate]).T.reshape(-1, 1, 2)\n segments = np.concatenate([points[:-1], points[1:]], axis=1)\n norm = plt.Normalize(thresholds.min(), thresholds.max())\n lc = matplotlib.collections.LineCollection(segments, cmap='jet', norm=norm, linewidths=2)\n lc.set_array(thresholds)\n line = ax.add_collection(lc)\n fig.colorbar(line, ax=ax).set_label('Threshold')\n\n # Padding to ensure we see the line\n ax.margins(0.01)\n\n plt.title(f\"ROC curve, AUC={roc_auc}\")\n plt.xlabel(\"False Positive Rate\")\n plt.ylabel(\"True Positive Rate\")\n plt.tight_layout()\n plt.savefig(\"my_roc_curve.png\", dpi=600)\n\n # Predict\n else:\n # It might not make as much sense to draw a plot when predicting...\n pass\n","name":"post_processing_roc_curve_sklearn.py","contextProviders":[],"applicationName":"python","executableName":"python"},{"content":"#!/bin/bash\n# ---------------------------------------------------------------- #\n# #\n# Example shell script for Exabyte.io platform. #\n# #\n# Will be used as follows: #\n# #\n# 1. shebang line is read from the first line above #\n# 2. based on shebang one of the shell types is selected: #\n# - /bin/bash #\n# - /bin/csh #\n# - /bin/tclsh #\n# - /bin/tcsh #\n# - /bin/zsh #\n# 3. runtime directory for this calculation is created #\n# 4. the content of the script is executed #\n# #\n# Adjust the content below to include your code. #\n# #\n# ---------------------------------------------------------------- #\n\necho \"Hello world!\"\n","name":"hello_world.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ---------------------------------------------------------------- #\n# #\n# Example job submission script for Exabyte.io platform #\n# #\n# Shows resource manager directives for: #\n# #\n# 1. the name of the job (-N) #\n# 2. the number of nodes to be used (-l nodes=) #\n# 3. the number of processors per node (-l ppn=) #\n# 4. the walltime in dd:hh:mm:ss format (-l walltime=) #\n# 5. queue (-q) D, OR, OF, SR, SF #\n# 6. merging standard output and error (-j oe) #\n# 7. email about job abort, begin, end (-m abe) #\n# 8. email address to use (-M) #\n# #\n# For more information visit https://docs.exabyte.io/cli/jobs #\n# ---------------------------------------------------------------- #\n\n#PBS -N ESPRESSO-TEST\n#PBS -j oe\n#PBS -l nodes=1\n#PBS -l ppn=1\n#PBS -l walltime=00:00:10:00\n#PBS -q D\n#PBS -m abe\n#PBS -M info@exabyte.io\n\n# load module\nmodule add espresso/540-i-174-impi-044\n\n# go to the job working directory\ncd $PBS_O_WORKDIR\n\n# create input file\ncat > pw.in < pw.out\n","name":"job_espresso_pw_scf.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\nmkdir -p {{ JOB_SCRATCH_DIR }}/outdir/_ph0\ncd {{ JOB_SCRATCH_DIR }}/outdir\ncp -r {{ JOB_WORK_DIR }}/../outdir/__prefix__.* .\n{%- endraw -%}\n","name":"espresso_link_outdir_save.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"{%- raw -%}\n#!/bin/bash\n\ncp {{ JOB_SCRATCH_DIR }}/outdir/_ph0/__prefix__.phsave/dynmat* {{ JOB_WORK_DIR }}/../outdir/_ph0/__prefix__.phsave\n{%- endraw -%}\n","name":"espresso_collect_dynmat.sh","contextProviders":[],"applicationName":"shell","executableName":"sh"},{"content":"#!/bin/bash\n\n# ------------------------------------------------------------------ #\n# This script prepares necessary directories to run VASP NEB\n# calculation. It puts initial POSCAR into directory 00, final into 0N\n# and intermediate images in 01 to 0(N-1). It is assumed that SCF\n# calculations for initial and final structures are already done in\n# previous subworkflows and their standard outputs are written into\n# \"vasp_neb_initial.out\" and \"vasp_neb_final.out\" files respectively.\n# These outputs are here copied into initial (00) and final (0N)\n# directories to calculate the reaction energy profile.\n# ------------------------------------------------------------------ #\n\n{% raw -%}\ncd {{ JOB_WORK_DIR }}\n{%- endraw %}\n\n# Prepare First Directory\nmkdir -p 00\ncat > 00/POSCAR < 0{{ input.INTERMEDIATE_IMAGES.length + 1 }}/POSCAR < 0{{ loop.index }}/POSCAR < { templates = getAllAppTemplates(); }); + it("exists at least 1 asset for each tree entry for deepmd tree", () => { + const tree = APP_TREE.deepmd; + + Object.keys(tree).forEach((treeItemName) => { + const treeItemTemplates = templates.filter( + (template) => template.executableName === treeItemName, + ); + + assert(treeItemTemplates.length > 0); + }); + }); + it("exists at least 1 asset for each tree entry for espresso tree", () => { const tree = APP_TREE.espresso;