Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug]: Issues with tc_analysis in zppy #622

Open
chengzhuzhang opened this issue Sep 23, 2024 · 9 comments · Fixed by #623 · May be fixed by #633
Open

[Bug]: Issues with tc_analysis in zppy #622

chengzhuzhang opened this issue Sep 23, 2024 · 9 comments · Fixed by #623 · May be fixed by #633
Assignees
Labels
semver: bug Bug fix (will increment patch version)

Comments

@chengzhuzhang
Copy link
Collaborator

chengzhuzhang commented Sep 23, 2024

What happened?

I ran into a few issues with setting up tc_analysis with a fully configured zppy run.

  1. Status file shows "RUNNING" after slurm error: slurmstepd: error: *** JOB 588238 ON chr-0471 CANCELLED AT 2024-09-20T16:27:45 DUE TO TIME LIMIT ***
  2. All three e3sm_diags runs (model vs obs, model vs model, model vs model [land-only]) are waiting for tc_analysis even though only the model vs obs task depends on tc_analysis.

What machine were you running on?

Chrysalis

Environment

e3sm_unified_1.10

What command did you run?

zppy -c

Copy your cfg file

[default]
input = /lcrc/group/e3sm2/ac.wlin/E3SMv3/v3.LR.historical_0051
output = /lcrc/group/e3sm2/ac.zhang40/E3SMv3/v3.LR.historical_0920
case = v3.LR.historical_0051
www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.zhang40/E3SMv3_0920
partition = compute
environment_commands = "source /lcrc/soft/climate/e3sm-unified/load_latest_e3sm_unified_chrysalis.sh"
#environment_commands = "source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.10.0rc5_chrysalis.sh"
campaign = "water_cycle"

[climo]
active = True
#years = "0001:0100:50", "0001:0100:100"
#years = "1985:2014:30", "1985:2014:15"
years = "1985:2014:30",
walltime = "1:00:00"

  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h3"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  vars = "PRECT"
  frequency = "diurnal_8xdaily"

  [[ land_monthly_climo ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = archive/lnd/hist
  vars = ""

[ts]
active = True
years = "1985:2014:30"
walltime = "00:50:00"


  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  vars = "FSNTOA,FLUT,FSNT,FLNT,FSNS,FLNS,SHFLX,QFLX,TAUX,TAUY,PRECC,PRECL,PRECSC,PRECSL,TS,TREFHT,CLDTOT,CLDHGH,CLDMED,CLDLOW,U,ICEFRAC,LANDFRAC,OCNFRAC,PS,CLDICE,CLDLIQ,T,AODDUST"
# Needed for mixed-phase partition
#  vars = "LANDFRAC,CLDICE,CLDLIQ,T" 
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h1"
  frequency = "daily"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  # Needed for Wheeler Kiladis
  vars = "FLUT,PRECT,U850"

  [[ atm_monthly_glb ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = "glb"

  [[ land_monthly ]]
  input_subdir = "archive/lnd/hist"
  input_files = "elm.h0"
  frequency = "monthly"
  #mapping_file = ""
  mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
  vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILICE,SOILLIQ,SOILWATER_10CM,TSA,TSOI,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
  extra_vars = "landfrac"
  ts_fmt = "cmip"
#
  [[ rof_monthly ]]
  input_subdir = "archive/rof/hist"
  input_files = "mosart.h0"
  mapping_file = ""
  frequency = "monthly"
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"
  extra_vars = 'areatotal2'

  [[ land_monthly ]]
  input_subdir = "archive/lnd/hist"
  input_files = "elm.h0"
  frequency = "monthly"
  #mapping_file = ""
  mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
  vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILICE,SOILLIQ,SOILWATER_10CM,TSA,TSOI,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
  extra_vars = "landfrac"
  ts_fmt = "cmip"
#
  [[ rof_monthly ]]
  input_subdir = "archive/rof/hist"
  input_files = "mosart.h0"
  mapping_file = ""
  frequency = "monthly"
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"
  extra_vars = 'areatotal2'

  [[ lnd_monthly_glb ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  mapping_file = "glb"
  vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILWATER_10CM,TSA,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"

[tc_analysis]
active = True
scratch = "/lcrc/globalscratch/$USER"
# Make walltime very short to reproduce this error
walltime = "00:10:00"
years = "1985:2014:30",



[e3sm_diags]
active = True
walltime = "4:00:00"
#years = "0001:0100:50", "0001:0100:100"
years = "1985:2014:30",
ts_num_years = 30
ref_start_yr = 1985
ref_final_yr = 2014
multiprocessing = True
num_workers = 8

  [[ atm_monthly_180x360_aave ]]
#  environment_commands = "source /home/ac.zhang40/y/etc/profile.d/conda.sh; conda activate edv2110"
  short_name = 'v3.LR.historical_0051'
  grid = '180x360_aave'
  reference_data_path = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/climatology'
  obs_ts = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/time-series'
  dc_obs_climo = '/lcrc/group/e3sm/public_html/e3sm_diags_test_data/unit_test_complete_run/obs/climatology'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  ts_daily_subsection = "atm_daily_180x360_aave"
  sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_aeronet","tropical_subseasonal","tc_analysis",
#"mp_partition","aerosol_budget",
#  sets="tropical_subseasonal",
  output_format_subplot = "pdf",

  [[ lnd_monthly_mvm_lnd ]]
  # Test model-vs-model using the same files as the reference
  #environment_commands = "source /home/ac.zhang40/y/etc/profile.d/conda.sh; conda activate edv290"
  grid = 'native'
  climo_subsection = "land_monthly_climo"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  short_name = v3.LR.piControl
  ref_name = "20231209.v3.LR.piControl-spinup.chrysalis"
  ref_start_yr = 0051
  ref_final_yr = 0100
  ref_years = "0051-0100",
  reference_data_path = "/lcrc/group/e3sm/ac.zhang40/tests/20231209.v3.LR.piControl-spinup.chrysalis_land_diags/post/lnd/native/clim"
  run_type = "model_vs_model"
  sets = "lat_lon_land",
  short_ref_name = "20231209.v3.LR.piControl-spinup"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 50

  [[atm_monthly_180x360_aave_mvm]]
  #years = "0001-0050", 
  #years = "1985-2014", 
  ref_years = "0001-0050",
  ref_start_yr = 1
  ref_final_yr = 50
  ts_num_years = 30 
  ts_num_years_ref = 10
  ts_subsection = "atm_monthly_180x360_aave"
  short_name = 'v3alpha04-COARE.piControl'
  grid = '180x360_aave'
  ref_name = '20230924.v3alpha04_trigrid.piControl.chrysalis'
  short_ref_name = 'v3alpha04-CTL.piControl'
  tag = 'v3alpha04i-COARE_vs_CTL'
  run_type = "model_vs_model"
  reference_data_path = '/lcrc/group/e3sm2/ac.xzheng/E3SMv3_dev/20230924.v3alpha04_trigrid.piControl.chrysalis/post/atm/180x360_aave/clim'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_subsection = "atm_monthly_180x360_aave"
  sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_budget"
  diff_title = 'Difference'
#  output_format_subplot = "pdf",

[ilamb]
active = True 
nodes = 8
walltime = "2:00:00"
partition = compute 
short_name = 'v3.LR.historical_0051'
#ts_land_grid = 'native'
ts_num_years = 30
years = "1985:2014:30"

[global_time_series]
active = True
experiment_name = "v3.LR.historical_0051"
figstr = "v3.LR.historical_0051"
#plots_original = "net_toa_flux_restom,global_surface_air_temperature,toa_radiation,net_atm_energy_imbalance,net_atm_water_imbalance"
plots_atm = "TREFHT,AODDUST"
plots_lnd = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILWATER_10CM,TSA,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
ts_num_years = 30
walltime = "00:30:00"
years = "1985-2014",
climo_years ="1985-2014",
ts_years ="1985-2014",
moc_file = "mocTimeSeries_1985-2014.nc"

What jobs are failing?

No response

What stack trace are you encountering?

No response

@chengzhuzhang chengzhuzhang added the semver: bug Bug fix (will increment patch version) label Sep 23, 2024
@forsyth2
Copy link
Collaborator

Yes I ran into this issue too on main. Looking into it.

@forsyth2
Copy link
Collaborator

forsyth2 commented Sep 23, 2024

tl;dr dependencies need to be defined per-task in the .py files. I'll make a PR.


I ran zppy -c issue_622.cfg, which uses this cfg:

[default]
input = /lcrc/group/e3sm2/ac.wlin/E3SMv3/v3.LR.historical_0051
#output = /lcrc/group/e3sm2/ac.zhang40/E3SMv3/v3.LR.historical_0920
output = /lcrc/group/e3sm/ac.forsyth2/issue_622
case = v3.LR.historical_0051
#www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.zhang40/E3SMv3_0920
www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2
partition = compute
environment_commands = "source /lcrc/soft/climate/e3sm-unified/load_latest_e3sm_unified_chrysalis.sh"
campaign = "water_cycle"

[climo]
active = True
years = "1985:2014:30",
walltime = "1:00:00"

  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h3"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  vars = "PRECT"
  frequency = "diurnal_8xdaily"

  [[ land_monthly_climo ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = archive/lnd/hist
  vars = ""

[ts]
active = True
years = "1985:2014:30"
walltime = "00:50:00"


  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  vars = "FSNTOA,FLUT,FSNT,FLNT,FSNS,FLNS,SHFLX,QFLX,TAUX,TAUY,PRECC,PRECL,PRECSC,PRECSL,TS,TREFHT,CLDTOT,CLDHGH,CLDMED,CLDLOW,U,ICEFRAC,LANDFRAC,OCNFRAC,PS,CLDICE,CLDLIQ,T,AODDUST"
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h1"
  frequency = "daily"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  # Needed for Wheeler Kiladis
  vars = "FLUT,PRECT,U850"

  [[ atm_monthly_glb ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = "glb"

  [[ land_monthly ]]
  input_subdir = "archive/lnd/hist"
  input_files = "elm.h0"
  frequency = "monthly"
  mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
  vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILICE,SOILLIQ,SOILWATER_10CM,TSA,TSOI,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
  extra_vars = "landfrac"
  ts_fmt = "cmip"

  [[ rof_monthly ]]
  input_subdir = "archive/rof/hist"
  input_files = "mosart.h0"
  mapping_file = ""
  frequency = "monthly"
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"
  extra_vars = 'areatotal2'

  [[ lnd_monthly_glb ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  mapping_file = "glb"
  vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILWATER_10CM,TSA,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"

[tc_analysis]
active = True
scratch = "/lcrc/globalscratch/$USER"
# Make walltime very short to reproduce this error
walltime = "00:10:00"
years = "1985:2014:30",

[e3sm_diags]
active = True
walltime = "4:00:00"
years = "1985:2014:30",
ts_num_years = 30
ref_start_yr = 1985
ref_final_yr = 2014
multiprocessing = True
num_workers = 8

  [[ atm_monthly_180x360_aave ]]
  short_name = 'v3.LR.historical_0051'
  grid = '180x360_aave'
  reference_data_path = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/climatology'
  obs_ts = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/time-series'
  dc_obs_climo = '/lcrc/group/e3sm/public_html/e3sm_diags_test_data/unit_test_complete_run/obs/climatology'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  ts_daily_subsection = "atm_daily_180x360_aave"
  sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_aeronet","tropical_subseasonal","tc_analysis",
  output_format_subplot = "pdf",

  [[ lnd_monthly_mvm_lnd ]]
  # Test model-vs-model using the same files as the reference
  grid = 'native'
  climo_subsection = "land_monthly_climo"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  short_name = v3.LR.piControl
  ref_name = "20231209.v3.LR.piControl-spinup.chrysalis"
  ref_start_yr = 0051
  ref_final_yr = 0100
  ref_years = "0051-0100",
  reference_data_path = "/lcrc/group/e3sm/ac.zhang40/tests/20231209.v3.LR.piControl-spinup.chrysalis_land_diags/post/lnd/native/clim"
  run_type = "model_vs_model"
  sets = "lat_lon_land",
  short_ref_name = "20231209.v3.LR.piControl-spinup"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 50

  [[atm_monthly_180x360_aave_mvm]]
  ref_years = "0001-0050",
  ref_start_yr = 1
  ref_final_yr = 50
  ts_num_years = 30 
  ts_num_years_ref = 10
  ts_subsection = "atm_monthly_180x360_aave"
  short_name = 'v3alpha04-COARE.piControl'
  grid = '180x360_aave'
  ref_name = '20230924.v3alpha04_trigrid.piControl.chrysalis'
  short_ref_name = 'v3alpha04-CTL.piControl'
  tag = 'v3alpha04i-COARE_vs_CTL'
  run_type = "model_vs_model"
  reference_data_path = '/lcrc/group/e3sm2/ac.xzheng/E3SMv3_dev/20230924.v3alpha04_trigrid.piControl.chrysalis/post/atm/180x360_aave/clim'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_subsection = "atm_monthly_180x360_aave"
  sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_budget"
  diff_title = 'Difference'

[ilamb]
active = True 
nodes = 8
walltime = "2:00:00"
partition = compute 
short_name = 'v3.LR.historical_0051'
ts_num_years = 30
years = "1985:2014:30"

[global_time_series]
active = True
experiment_name = "v3.LR.historical_0051"
figstr = "v3.LR.historical_0051"
plots_atm = "TREFHT,AODDUST"
plots_lnd = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILWATER_10CM,TSA,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
ts_num_years = 30
walltime = "00:30:00"
years = "1985-2014",
climo_years ="1985-2014",
ts_years ="1985-2014",
moc_file = "mocTimeSeries_1985-2014.nc"

Note I had to remove duplicated subblocks from your cfg.

$ squeue -o "%8u %.7a %.4D %.9P %7i %.2t %.10r %.10M %.10l %j" --sort=P,-t,-p -u ac.forsyth2
USER     ACCOUNT NODE PARTITION JOBID   ST     REASON       TIME TIME_LIMIT NAME
ac.forsy    e3sm    1   compute 591484  PD Dependency       0:00    4:00:00 e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
ac.forsy    e3sm    1   compute 591485  PD Dependency       0:00    4:00:00 e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100
ac.forsy    e3sm    1   compute 591486  PD Dependency       0:00    4:00:00 e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050

We see here that the three E3SM Diags tasks are all waiting on a dependency.

$ cd /lcrc/group/e3sm/ac.forsyth2/issue_622/post/scripts

$ grep -v "OK" *status
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014.status:WAITING 591484
e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050.status:WAITING 591486
e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.status:WAITING 591485
tc_analysis_1985-2014.status:RUNNING 591483

$ tail -n 1 tc_analysis_1985-2014.o591483 
slurmstepd: error: *** JOB 591483 ON chr-0229 CANCELLED AT 2024-09-23T13:41:46 DUE TO TIME LIMIT ***

Note, your first point (the status file shows "RUNNING" after a SLURM error) is unfortunately something I've never figured out a way around. That is, when SLURM hits the time limit, there is no time left on the job to update the status file to "ERROR." It said "RUNNING" when the time limit hit and, at that point, no more changes can be made.

For your second point, on all three e3sm_diags runs waiting:

e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014 has:

  sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_aeronet","tropical_subseasonal","tc_analysis",

This includes "tc_analysis", so naturally it will not run.

e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050 has:

  sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_budget"

e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100 has:

sets = "lat_lon_land",

I added print(f"dependencies={dependencies}") in e3sm_diags.py. That gives me the following dependency lists:

e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
dependencies=[
    'climo_atm_monthly_180x360_aave_1985-2014.status', 
    'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status', 
    'tc_analysis_1985-2014.status', 
    'ts_atm_monthly_180x360_aave_1985-2014-0030.status', 
    'ts_atm_daily_180x360_aave_1985-2014-0030.status']

e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100
dependencies=[
    'climo_atm_monthly_180x360_aave_1985-2014.status', 
    'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status', 
    'tc_analysis_1985-2014.status', 
    'ts_atm_monthly_180x360_aave_1985-2014-0030.status', 
    'ts_atm_daily_180x360_aave_1985-2014-0030.status']

e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050
dependencies=[
    'climo_atm_monthly_180x360_aave_1985-2014.status', 
    'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status', 
    'tc_analysis_1985-2014.status', 
    'ts_atm_monthly_180x360_aave_1985-2014-0030.status', 
    'ts_atm_daily_180x360_aave_1985-2014-0030.status', 
    'climo_atm_monthly_180x360_aave_1985-2014.status', 
    'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status', 
    'ts_atm_monthly_180x360_aave_1985-2014-0030.status']

It looks like dependencies get added as we go through the cfg, but we never start fresh.

In e3sm_diags.py, we have:

    dependencies: List[str] = []

    for c in tasks:

Really, we should be resetting dependencies for each task...

I will make a PR for that.

@forsyth2
Copy link
Collaborator

Resolved by #623.

@chengzhuzhang
Copy link
Collaborator Author

chengzhuzhang commented Oct 16, 2024

@forsyth2 I'm just wondering: have you tested whether the zppy configuration (#622 (comment)) works with 3 e3sm_diags tasks running in parallel after the two recent PRs?

@forsyth2
Copy link
Collaborator

I haven't tested 3 tasks in parallel explicitly. However, I did just run multiple tests in parallel for #632:

zppy -c tests/integration/generated/test_min_case_e3sm_diags_tc_analysis_v2_chrysalis.cfg # Runs 1 `e3sm_diags` task.
zppy -c tests/integration/generated/test_min_case_e3sm_diags_tc_analysis_v2_parallel_chrysalis.cfg # Runs 2 `e3sm_diags` tasks in parallel

I did run into some problems on the v3 side though, as described in #632 (comment).

I can also increase the number of parallel tasks in min_case_e3sm_diags_tc_analysis_v2_parallel to 3 or more if that would be valuable.

Or are you asking if I've run issue_622.cfg (from the linked comment) post-#623? I can also do that.

@chengzhuzhang
Copy link
Collaborator Author

Or are you asking if I've run issue_622.cfg (from the linked comment) post-#623? I can also do that.

Yes, I mean that we should make sure the second issue in 622 is resolved as intended.

@forsyth2
Copy link
Collaborator

forsyth2 commented Oct 16, 2024

Ok, with the tc_analysis task failing due to time limit reached, only the e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014 task gets blocked, as intended. So I would say that #622 is indeed resolved.

I did however notice an unrelated error on the land task:

$ cd /lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/scripts

$ cat e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.status
ERROR (1)

$ cat e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.o607287
cp: cannot stat '/lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/lnd/native/clim/30yr/v3.LR.historical_0051_*_1985??_2014??_climo.nc': No such file or directory

# But:
$ ls /lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/lnd/native/clim/30yr/
v3.LR.historical_0051_01_198501_201401_climo.nc  v3.LR.historical_0051_07_198507_201407_climo.nc  v3.LR.historical_0051_ANN_198501_201412_climo.nc
v3.LR.historical_0051_02_198502_201402_climo.nc  v3.LR.historical_0051_08_198508_201408_climo.nc  v3.LR.historical_0051_DJF_198501_201412_climo.nc
v3.LR.historical_0051_03_198503_201403_climo.nc  v3.LR.historical_0051_09_198509_201409_climo.nc  v3.LR.historical_0051_JJA_198506_201408_climo.nc
v3.LR.historical_0051_04_198504_201404_climo.nc  v3.LR.historical_0051_10_198510_201410_climo.nc  v3.LR.historical_0051_MAM_198503_201405_climo.nc
v3.LR.historical_0051_05_198505_201405_climo.nc  v3.LR.historical_0051_11_198511_201411_climo.nc  v3.LR.historical_0051_SON_198509_201411_climo.nc
v3.LR.historical_0051_06_198506_201406_climo.nc  v3.LR.historical_0051_12_198512_201412_climo.nc

@chengzhuzhang
Copy link
Collaborator Author

Could it be a dependency issue? Does the e3sm_diags_lnd_monthly_mvm_lnd task start before the climo files are generated?

@forsyth2
Copy link
Collaborator

So it is. This is bizarre. I'll look into it; it should be depending on climo.

$ grep dependencies e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.settings 
  'dependencies': [],

@chengzhuzhang chengzhuzhang reopened this Oct 16, 2024
@forsyth2 forsyth2 linked a pull request Oct 16, 2024 that will close this issue
15 tasks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
semver: bug Bug fix (will increment patch version)
Projects
None yet
2 participants