Update to DB qiita.slurm_resource_allocations #3401

Closed · wants to merge 2 commits
74 changes: 74 additions & 0 deletions notebooks/resource-allocation/upload_df.py
@@ -0,0 +1,74 @@
import pandas as pd

# Example data loading
filename = './data/jobs_2024-02-21.tsv.gz'
df = pd.read_csv(filename, sep='\t', dtype={'extra_info': str})
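# Columns the script relies on (all expected in the TSV): QiitaID, cName,
# sName, samples, columns, input_size, extra_info, MaxRSSRaw and
# ElapsedRawTime.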

# Convert the elapsed-time strings to timedeltas, then to total seconds
df['ElapsedRawTime'] = pd.to_timedelta(
    df['ElapsedRawTime']).dt.total_seconds()

# Restrict to a single cName / sName combination
cname = "Validate"
sname = "Diversity types - alpha_vector"
df = df[(df.cName == cname) & (df.sName == sname)]

# Drop rows missing any of the values needed for the INSERT statements
columns_to_check = ['samples', 'columns', 'input_size',
                    'MaxRSSRaw', 'ElapsedRawTime']
df = df.dropna(subset=columns_to_check)


# Grouping key: samples * columns
COL_NAME = 'samples * columns'
df[COL_NAME] = df['samples'] * df['columns']
metrics = ["MaxRSSRaw", "ElapsedRawTime"]
max_rows = []

for curr in metrics:
    # Get the maximum value for 'curr' within each COL_NAME group
    max_values = df.groupby(COL_NAME)[curr].transform('max')
    # Filter rows where the current column's value
    # is the maximum within its group
    curr_rows = df[df[curr] == max_values]
    max_rows.append(curr_rows)

filtered_df = pd.concat(max_rows).drop_duplicates().reset_index(drop=True)
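# Note: transform('max') broadcasts each group's maximum to every row, so
# the comparison above keeps all rows that tie for the maximum;
# df.loc[df.groupby(COL_NAME)[metric].idxmax()] would keep only the first
# match per group.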


# Sanity check: number of rows that will become INSERT tuples
print(len(filtered_df))


# INSERT INTO qiita.processing_job(processing_job_id, email, command_id,
# command_parameters, processing_job_status_id)
# VALUES('ca27ddbc-a678-4b09-8a1d-b65f52f8eb49',
# '[email protected]', 1, '""'::json, 1);

# INSERT INTO qiita.slurm_resource_allocations(processing_job_id, samples,
# columns, input_size, extra_info, memory_used, walltime_used)
# VALUES('ca27ddbc-a678-4b09-8a1d-b65f52f8eb49', 39, 81, 2, 'nan',
# 327036000, 91);

# processing_job_id uuid NOT NULL,
# samples integer,
# columns integer,
# input_size bigint,
# extra_info varchar DEFAULT NULL,
# memory_used bigint,
# walltime_used integer,

# Build the two INSERT statements; the commented-out SQL above shows the
# expected shape. Each filtered row becomes one VALUES tuple.
job_values = []
allocation_values = []
for _, row in filtered_df.iterrows():
    job_values.append(
        f"""('{row['QiitaID']}', '[email protected]', 1, '""'::json, 1)""")
    allocation_values.append(
        f"('{row['QiitaID']}', {int(row['samples'])}, "
        f"{int(row['columns'])}, {int(row['input_size'])}, "
        f"'{row['extra_info']}', {int(row['MaxRSSRaw'])}, "
        f"{int(row['ElapsedRawTime'])})")

res = (
    "INSERT INTO qiita.processing_job (processing_job_id, email,\n"
    "    command_id, command_parameters, processing_job_status_id)\n"
    "VALUES\n"
    + ",\n".join(job_values) + ";\n"
    "INSERT INTO qiita.slurm_resource_allocations (processing_job_id,\n"
    "    samples, columns, input_size, extra_info, memory_used,\n"
    "    walltime_used)\nVALUES\n"
    + ",\n".join(allocation_values) + ";\n")

with open("sql.txt", 'w') as f:
    f.write(res)
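# A possible way to apply the generated statements (a sketch only; it
# assumes the target is the PostgreSQL instance backing Qiita, that the
# database name is 'qiita', and that psycopg2 is installed):
#
#     import psycopg2
#     with psycopg2.connect(dbname='qiita') as conn:
#         with conn.cursor() as cur:
#             cur.execute(open('sql.txt').read())
#
# or, from the shell: psql -d qiita -f sql.txt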