Skip to content

Commit

Permalink
many simrun fixes; fix for crash in plot.
Browse files Browse the repository at this point in the history
  • Loading branch information
rcoreilly committed Jan 11, 2025
1 parent c742f95 commit 2babefc
Show file tree
Hide file tree
Showing 15 changed files with 510 additions and 138 deletions.
10 changes: 10 additions & 0 deletions examples/simrun/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# simrun: simulation running infrastructure



# Configuring a new Linux compute server

```sh
sudo apt install golang gcc libgl1-mesa-dev libegl1-mesa-dev mesa-vulkan-drivers xorg-dev vulkan-tools nvidia-driver-565-server
```

102 changes: 102 additions & 0 deletions examples/simrun/barerun.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

102 changes: 102 additions & 0 deletions examples/simrun/barerun.goal
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Copyright (c) 2024, Cogent Lab. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
"fmt"
"io"
"strings"
"os"

"cogentcore.org/core/core"
"cogentcore.org/lab/goal/goalib"
)

// WriteRun writes to w the bash script for a bare (non-slurm) job:
// it builds the project, waits briefly, records the start date in job.start,
// and launches the project executable in the background with its output
// redirected to job.out.
//
//   - args is appended verbatim to the launch command line.
//   - gpuno indexes br.Config.Server.GPUIDs to select the value passed
//     as -gpu-device.
//   - jid is not used by this function.
//
// Errors from writing to w are not checked.
func (br *SimRun) WriteRun(w io.Writer, jid, args string, gpuno int) {
	// -l = login session, sources your .bash_profile
	fmt.Fprintln(w, "#!/bin/bash -l")
	// fmt.Fprint(w, "#SBATCH --output=job.setup.out\n")
	// fmt.Fprint(w, "#SBATCH --error=job.setup.err\n")
	fmt.Fprint(w, "\n\n")

	// mpi alternative: fmt.Fprintf(w, "go build -mod=mod -tags mpi\n")
	fmt.Fprintln(w, "go build -mod=mod")
	fmt.Fprintln(w, "sleep 2")
	// Fprint (not Fprintf): the date format contains literal % characters.
	fmt.Fprint(w, "date '+%Y-%m-%d %T %Z' > job.start\n")

	// Disabled alternatives, kept for reference:
	//   - one process per run, cycling through the GPU ids:
	//     for i := range br.Config.Job.NRuns {
	//         fmt.Fprintf(w, "GPU_DEVICE=%d ./%s -nogui -cfg config_job.toml -run %d -runs 1 %s &\n", gpus[gpuno], br.Config.Project, i, args)
	//         cgpu = (cgpu + 1) % ngpu
	//     }
	//   - nohup wrapper inside the script:
	//     `nohup bash -c "(./%s -nogui -cfg config_job.toml -gpu-device %d %s) &> job.out" &`
	//
	// The plain background launch below actually works, along with the nohup
	// at submission -- need to get process id here.
	device := br.Config.Server.GPUIDs[gpuno]
	const launch = "./%s -nogui -cfg config_job.toml -gpu-device %d %s &> job.out &"
	fmt.Fprintf(w, launch, br.Config.Project, device, args)
}

// SubmitRun writes the job launch script to job.sbatch using WriteRun,
// copies it to the @1 host with scp, and runs it there through BareRun.
// It returns the job id reported by BareRun, or "" if the script file
// could not be created or closed (the error is printed, and nothing is
// copied or launched in that case).
//
//   - jid, args and gpuno are passed through to WriteRun.
//   - args and gpuno are also passed through to BareRun.
func (br *SimRun) SubmitRun(jid, args string, gpuno int) string {
	// presumably @0 selects the local host, so the script is written locally -- TODO confirm
	@0
	f, err := os.Create("job.sbatch")
	if err != nil {
		fmt.Println("SubmitRun ERROR: could not create job.sbatch:", err)
		return ""
	}
	br.WriteRun(f, jid, args, gpuno)
	// Close explicitly (not deferred): the file must be complete before scp.
	if err := f.Close(); err != nil {
		fmt.Println("SubmitRun ERROR: could not close job.sbatch:", err)
		return ""
	}
	scp job.sbatch @1:job.sbatch
	sid := br.BareRun("job.sbatch", args, gpuno)
	return sid
}

// BareRun runs batch job on the given batch file,
// returning the resulting job id.
// It makes the sbatch script executable and launches it with nohup.
// If br.Config.Server.Slurm is set, the job id is the last
// whitespace-separated field of the output of `cat job.slurm` run via @1;
// if that output is empty, an error is printed and "" is returned.
// Otherwise the fixed placeholder id "nj" is returned.
// The args and gpuno parameters are only referenced by the commented-out
// direct-launch code below.
func (br *SimRun) BareRun(sbatch string, args string, gpuno int) string {
// presumably @1 selects the remote server host and @0 the local host -- TODO confirm
@1
chmod +x {sbatch}
// note: can't seem to get this to work:
$nohup {"./"+sbatch}$
// disabled alternative: build and launch the project directly instead of
// running the script:
// go build -mod=mod
// date '+%Y-%m-%d %T %Z' > job.start
// gpud := fmt.Sprintf("%d", br.Config.Server.GPUIDs[gpuno])
// fmt.Println("Running on GPU number:", gpud)
// ${"./" + br.Config.Project} -nogui -cfg config_job.toml -gpu-device {gpud} {args} >& job.out &$
@0
if br.Config.Server.Slurm {
// captured output of cat job.slurm, run via @1
ss := $@1 cat job.slurm$
if ss == "" {
fmt.Println("JobStatus ERROR: no server job.slurm file to get server job id from")
@1 cd
@0
return ""
}
// the job id is taken to be the last whitespace-separated field
ssf := strings.Fields(ss)
sj := ssf[len(ssf)-1]
return sj
}
// not a slurm server: no job id is read, so return a fixed placeholder
return "nj"
}

// Finalize is a temporary hack to finalize the job status.
// For each job selected in the jobs table, it changes into the job's
// directory and, unless job.status reads "Fetched", overwrites job.status
// with "Finalized" and calls GetMeta for that job.
// UpdateSims is called once after all selected jobs have been processed.
// If no jobs are selected, it shows a snackbar message and returns.
func (br *SimRun) Finalize() { //types:add
tv := br.JobsTableView
jobs := tv.SelectedColumnStrings("JobID")
if len(jobs) == 0 {
core.MessageSnackbar(br, "No jobs selected for finalizing")
return
}
for _, jid := range jobs {
jpath := br.JobPath(jid)
// presumably @0 selects the local host, so job.status below is the local copy -- TODO confirm
@0
cd {jpath}
sstat := goalib.ReadFile("job.status")
// jobs whose status is already "Fetched" are left untouched
if sstat == "Fetched" {
continue
}
goalib.WriteFile("job.status", "Finalized")
br.GetMeta(jid)
}
br.UpdateSims()
}

58 changes: 40 additions & 18 deletions examples/simrun/config.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 2babefc

Please sign in to comment.