From 63bd89efeb1447a0213805331fcea331f7bbc23d Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 18 Jul 2024 04:02:07 +0000 Subject: [PATCH 01/14] Added quickTunerGen.py, the main driver of the quick tuner scripts which generates quick tuner perf configs given the dataframe/tsv file produced by quickTunerPreproc.py. --- mlir/utils/performance/quickTunerGen.py | 1145 +++++++++++++++++++++++ 1 file changed, 1145 insertions(+) create mode 100644 mlir/utils/performance/quickTunerGen.py diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py new file mode 100644 index 000000000000..76d38d0a142c --- /dev/null +++ b/mlir/utils/performance/quickTunerGen.py @@ -0,0 +1,1145 @@ +#!/usr/bin/env python3 + +""" +quickTuner script to generate quick tuner perf configs. Uses single input file from quickTunerPreproc.py +as input. +Needs the input to be a combined normalized dataframe (default from quickTunerPreproc.py) + +Usage: quickTunerGen.py [-h] --input-file INPUT_FILE [--method {default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} [{default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} ...]] [--save] [--debug] [--num NUM] + +Example Usage: + +python3 quickTunerGen.py --input-file TESTFILE.out --method fairSelect --save --debug --num 20 + +Will read TESTFILE.out then generate a quick tune list of length 20 for each datatype in TESTFILE.out. Will +both print these lists and save them to METHODNAME.DTYPE.qt. 
+""" + +import os +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['OMP_NUM_THREADS'] = '1' +import sys + +sys.path.append('../..') + +import argparse +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler +from sklearn.cluster import KMeans +from sklearn.cluster import MiniBatchKMeans +from sklearn import metrics +from sklearn.cluster import DBSCAN +from sklearn.preprocessing import MinMaxScaler +from sklearn.metrics import silhouette_score +from collections import defaultdict +import matplotlib.pyplot as plt +import seaborn as sns + +import faulthandler +import re +import glob + +class quickTunerMethod(object): + """ + base class for creating quick tuner methods, implement the getConfig() method. + """ + def __init__(self, name=None, N=40): + self.N = N + self.config = None + if name is None: + self.name = self.__class__.__name__ + else: + self.name = name + + + def setN(self, N): + """ + To set the current N count (number of configs) + """ + self.N = N + + def saveQt(self, name=None, debug=False, suffix=".qt"): + """ + Function to convert a type dictionary config to a .qt file + Converts the list of quickTuning sets into a group of files + """ + type_df = self.config + if name is None: + name = self.name + if debug: + print(filename + suffix) + printConfigDict(type_df) + for t in type_df: + fname = name + "." 
+ t + suffix + df = type_df[t] + if 'performance' in df.columns: + df = df.drop(labels=['performance'], axis=1) + df = df.to_csv(fname, index=False) + + def savePerfConfig(self, name=None, dtype=None, prefix="v2:"): + """ + """ + + type_df = self.config + if name is None: + name = self.name + + if dtype: + with open(name, 'w') as f: + for _, row in type_df[dtype].iterrows(): + tup = tuple(row) + s = prefix+",".join(map(str,tup)) + f.write(s) + f.write("\n") + + else: + for t in type_df: + with open(f"{name}_{dtype}", 'w') as f: + for _, row in type_df[t].iterrows(): + tup = tuple(row) + s = prefix+",".join(map(str,tup)) + f.write(s) + f.write("\n") + + + def getConfig(self, combined_df): + """ + produces a config that can be converted to .qt file using + convertToConfig + """ + raise NotImplementedError() + + +class quickTuner(object): + """ + quickTuner class to run quick tuning methods from, requires user to instantiate quickTuner object + then register quickTunerMethod child classes, finally run tune() + """ + def __init__(self, pargs): + self.methods = {} + self.N = pargs.num + """ + maybe something like + if self.input_dir: + self.combined_df = qtPreprocessor.process(self.input_dir) + else: + self.combined_df = self.input_file + """ + self.input_file = pargs.input_file + self.combined_df = pd.read_csv(self.input_file, sep='\t') + #self.__parseValidationArgs(pargs) uneeded + self.__parseMethods(pargs) + + def __parseMethods(self, pargs): + """ + parse each method in pargs.method + """ + gen_methods = pargs.method + for method in gen_methods: + if method == 'default': + self.addMethod(defaultQuickTune(method, self.N)) + elif method == 'topNSelect': + self.addMethod(topNSelection(method, self.N)) + elif method == 'topMode': + self.addMethod(topMode(method, self.N)) + elif method == 'takeNEach': + self.addMethod(takeNEach(method, self.N)) + elif method == 'fairSelect': + self.addMethod(fairSelect(method, self.N)) + else: + raise ValueError(f"Unknown method: 
{method}") + + def __parseValidationArgs(self, pargs): + """ + parses pargs.validate string for validator + """ + kwargs = {} + for item in pargs.vargs: + if '=' in item: + k, v = item.split('=', 1) + kwargs[k] = v + else: + raise ValueError(f"Argument {item} is not a valid key=value pair") + + # leftovers from pargs.validate + #if pargs.validate and pargs.validate == 'data': + # init validator + # self.validator = dataValidator(pargs.input_file,**kwargs) + #else: + # self.validator = None + + def addMethod(self, method: quickTunerMethod): + """ + Adds method to method dict + """ + self.methods[method.name] = method + + def tune(self): + self.method_results = {} + if not self.methods: + print("No methods are registered, use quickTuner.addMethod(method: quickTunerMethod), to add a method", file=sys.stderr) + exit(1) + else: + for k in self.methods: + method = self.methods[k] + df = method.getConfig(self.combined_df.copy()) + self.method_results[k] = df + + """ moved to quickTunerStat.py + def validate(self): + #Validate on either a dataset or by running rocmlir-tuning-gen + if self.validator is None: + print("validator not set", file=sys.stderr) + return + output_dict = {} + for method in self.method_results: + # df will be of the form: {type1: [data], type2: [data], ..., typeN: [data]} + for dtype in self.method_results[method]: + if dtype not in output_dict: + output_dict[dtype] = {} + gemm_data = self.validator.validate(self.method_results[method][dtype], dtype) + + + for df in gemm_data: # for every gemm config we get data back + ct = 0 + max_values = [] + threshold = 0.92 + for df in gemm_data: + if (df['performance'].dropna() <= threshold).all(): + #print(f"{name} does not meet threshold (>0.8): {df}") + ct += 1 + #max_values.append(df[column].max()) + output_dict[dtype][method] = ct + + self.output_df = pd.DataFrame(output_dict) + print(self.output_df) + """ + + def saveConfigs(self, debug=False): + """ + Iterate through methods and save to each file + """ + 
def orderDict(type_dict: dict):
    """
    order dictionary, removing nan along the way

    Every dataframe in type_dict (data type -> dataframe with a
    'performance' column) is replaced by a copy that has the rows
    containing nan dropped, is sorted by 'performance' from best to worst
    and has a fresh 0..n-1 index.

    The dictionary is updated in place and also returned.
    """
    for dtype in list(type_dict):
        # drop nan first, as orderGemmDict does, so that failed runs cannot
        # end up in the tail segments that callers sample from
        cleaned = type_dict[dtype].dropna(how='any')
        type_dict[dtype] = cleaned.sort_values(by=['performance'], ascending=False, ignore_index=True)

    return type_dict
tile_params = data['perf_config'].str.split(',', expand=True).astype(int) + + tile_params.columns = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + + + tile_params = tile_params.drop(['param8','param9'], axis=1) + + tile_params['performance'] = data['performance'] + + tile_params.replace('N/A', np.nan, inplace=True) + + return tile_params + +def printConfigDict(df): + for k, v in df.items(): + print(f"k:{k}\nv\n:{v}") + + +def readDirCluster(input_file: str, clustered_dfs): + """ + Given an input directory and the cluster dataframe, + read the cluster's by datatype into a dataframe, order + the data and take some portion of the maxes + Change for combined_df + """ + label_dict = {} + for label, cluster_df in clustered_dfs.items(): + # iterate through the cluster and grab all the files needed + type_dict = {} + for _, row in cluster_df.iterrows(): + g = row['g'] + m = row['m'] + n = row['n'] + k = row['k'] + transA = "true" if row['transA'] == 1 else 'false' + transB = "true" if row['transB'] == 1 else 'false' + + dir_str = f"g{g}_m{m}_n{n}_k{k}" + dir_path = os.path.join(input_file, dir_str) + + for root, _, files in os.walk(dir_path): + for file in files: + + if f"-{transA}_{transB}_" in file: + basename = os.path.basename(file) + type_str = basename.split('-')[0].split('_') + in_type_str = type_str[0] + out_type_str = type_str[1] + if in_type_str != out_type_str: + continue + + tile_params = parseData(os.path.join(root,file)) + + if in_type_str not in type_dict: + type_dict[in_type_str] = [tile_params] + else: + type_dict[in_type_str].append(tile_params) + + + for k in type_dict: + type_dict[k] = pd.concat(type_dict[k]) + label_dict[label] = type_dict + + return label_dict + + +def parseDir(input_file: str, normalize=True): + + final_df = input_file + + trans_cols = ['TransA', 'TransB'] + + param_cols = [ 'G', 'M', 'N','K'] + + final_df = final_df.astype({entry: bool for entry in trans_cols}) + + final_df = 
def parseDir2(input_file: str, normalize=True):
    """
    Read every '*.debug' tsv file in a directory and group the GEMM problem
    descriptions by data type.

    input_file -- path of the directory holding the '*.debug' files
    normalize  -- when True, min-max scale the 'TFlops' column of each file
                  separately before the files are combined

    Returns a dict mapping each 'DataType' value to a dataframe with the
    columns TransA, TransB (bool) and G, M, N, K (int). An empty dict is
    returned when the directory holds no '*.debug' file.
    """
    # sorted so the row order of the result does not depend on the file system
    tsv_files = sorted(glob.glob(f"{input_file}/*.debug"))
    if not tsv_files:
        return {}

    # was `df_dir = {}` followed by `dfs.append(...)`: `dfs` was never created
    dfs = []
    for file in tsv_files:
        df = pd.read_csv(file, sep='\t')
        if normalize:
            scaler = MinMaxScaler()
            df['TFlops'] = scaler.fit_transform(df[['TFlops']])
        dfs.append(df)

    final_df = pd.concat(dfs, ignore_index=True)

    trans_cols = ['TransA', 'TransB']
    param_cols = ['G', 'M', 'N', 'K']

    final_df = final_df.astype({entry: bool for entry in trans_cols})
    final_df = final_df.astype({entry: int for entry in param_cols})

    target_cols = trans_cols + param_cols

    # group on the full frame and project afterwards: target_cols does not
    # contain 'DataType', so projecting first made the groupby fail
    return {dtype: group[target_cols] for dtype, group in final_df.groupby('DataType')}
final_df.astype({entry: int for entry in param_cols}) + + target_cols = trans_cols + param_cols + + perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + + perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) + + perf_configs.columns = perf_config_cols + + perf_configs.drop(['param8', 'param9'], axis=1, inplace=True) + + perf_configs['performance'] = final_df['TFlops'] + + perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) + + grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in perf_configs.groupby(['DataType'])} + + for k in grouped: + group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(target_cols)} + grouped[k] = group + + return grouped + + +def convertToConfig(type_df, filename, suffix=".qt", debug=False): + """ + Converts the list of quickTuning sets into a group of files + """ + if debug: + print(filename + suffix) + printConfigDict(type_df) + + for t in type_df: + fname = filename + "." 
+ t + suffix + df = type_df[t] + if 'performance' in df.columns: + df = df.drop(labels=['performance'], axis=1) + df['forceUnroll'] = 1 + df = df.to_csv(fname, index=False) + +""" +Hardcoded tuner method +""" + +class hardcodeQuickTune(quickTunerMethod): + """ + Default quick tune method, uses preset values for the config file + """ + def __init__(self, name=None): + super().__init__(name) + self.default_f32 = pd.DataFrame({ + "M/block": [256, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16], + "N/block": [256, 64, 128, 128, 128, 64, 64, 64, 64, 64, 32, 16, 256, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 16, 128, 128, 64, 64, 32, 32, 16, 16, 32, 32, 16, 16], + "K/block": [2, 8, 8, 4, 2, 8, 8, 8, 4, 2, 4, 4, 8, 4, 4, 4, 2, 8, 8, 8, 8, 4, 4, 8, 4, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 4, 8, 4, 8], + "M/wave": [128, 128, 64, 128, 32, 64, 32, 32, 32, 128, 128, 32, 64, 64, 64, 32, 32, 32, 16, 32, 16, 32, 16, 64, 32, 16, 16, 16, 32, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16], + "N/wave": [32, 32, 16, 32, 32, 16, 32, 16, 32, 32, 16, 16, 16, 32, 16, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], + "kPack": [4, 1, 4, 4, 8, 1, 4, 1, 4, 4, 4, 8, 4, 1, 4, 4, 8, 4, 4, 4, 8, 4, 8, 8, 8, 4, 4, 8, 1, 4, 4, 4, 8, 4, 8, 8, 4, 8, 4, 8], + "forceUnroll": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + }) + + + self.default_f16 = pd.DataFrame({ + "M/block": [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16], + "N/block": [256, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 32, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 32, 32, 16, 128, 64, 64, 32, 32, 16, 128, 32, 64, 32], + 
"K/block": [8, 4, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 8, 4, 8, 8, 8, 8, 4, 4, 8, 8, 8, 8, 4, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8], + "M/wave": [64, 64, 128, 64, 32, 32, 128, 128, 64, 64, 32, 128, 32, 32, 64, 32, 32, 32, 64, 32, 32, 32, 32, 32, 16, 32, 32, 32, 32, 16, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16], + "N/wave": [32, 32, 32, 32, 32, 16, 32, 16, 32, 16, 32, 16, 32, 32, 16, 32, 16, 16, 32, 16, 32, 32, 32, 16, 16, 32, 32, 32, 16, 16, 32, 32, 16, 32, 16, 16, 16, 16, 16, 16], + "kPack": [4, 8, 8, 4, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 8, 8, 8, 8, 8, 4], + "forceUnroll": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + }) + + self.default_i8 = pd.DataFrame({ + "M/block": [128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16], + "N/block": [256, 128, 128, 128, 128, 64, 64, 64, 64, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 16, 256, 256, 128, 64, 64, 64, 64, 32, 32, 16, 64, 32, 16, 16], + "K/block": [8, 16, 8, 8, 8, 32, 8, 8, 4, 32, 16, 8, 4, 8, 16, 8, 8, 4, 4, 16, 16, 16, 8, 8, 8, 8, 16, 4, 32, 32, 16, 8, 4, 32, 16, 16, 16, 16, 32, 16], + "M/wave": [128, 64, 128, 64, 32, 64, 32, 32, 32, 64, 32, 64, 32, 32, 32, 32, 32, 32, 32, 32, 16, 32, 16, 32, 32, 16, 32, 32, 32, 16, 32, 16, 32, 16, 16, 16, 16, 16, 16, 16], + "N/wave": [16, 32, 16, 16, 16, 32, 32, 16, 16, 32, 16, 16, 16, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], + "kPack": [4, 8, 8, 8, 16, 4, 16, 16, 16, 4, 4, 8, 16, 8, 4, 16, 16, 16, 8, 4, 16, 4, 16, 16, 8, 16, 4, 8, 4, 4, 4, 16, 8, 4, 8, 8, 4, 16, 4, 4], + "forceUnroll": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + }) + + self.config = { 'f32': 
class topNSelection(quickTunerMethod):
    """
    splits data by type then splits into certain percentage evenly,
    taking the top performers from each group
    """
    def __init__(self, name=None, N=40, normalize=True):
        super().__init__(name, N)
        self.normalize = normalize

    def getConfig(self, combined_df):
        """
        Build the config from the combined dataframe.

        For every data type the perf configs are ordered by performance,
        cut into N // 2 equally sized consecutive segments, and the two best
        rows of each segment are kept.

        combined_df -- combined dataframe, passed on to orderByType

        Returns (and stores in self.config) a dict mapping data type to the
        dataframe of selected rows.
        """
        # was orderByType(input_file, ...): `input_file` does not exist here
        type_dict = orderByType(combined_df, normalize=self.normalize)

        type_dict = orderDict(type_dict)

        # at least one segment, otherwise N < 2 divides by zero below
        num_segments = max(1, self.N // 2)

        config_dict = {}

        for k, v in type_dict.items():
            # at least one row per segment, otherwise a data type with fewer
            # rows than segments would yield an empty selection
            seg_size = max(1, len(v) // num_segments)
            segments = [v.iloc[i * seg_size:(i + 1) * seg_size].head(2) for i in range(num_segments)]

            config_dict[k] = pd.concat(segments)

        self.config = config_dict
        return self.config
class takeNEach(quickTunerMethod):
    """
    take top performers from each gemm problem, then keep the N best overall
    """
    def __init__(self, name=None, N=40, normalize=True):
        super().__init__(name, N)
        self.normalize = normalize

    def getConfig(self, combined_df):
        """
        take top performers from N dataframes

        For every data type, each gemm problem contributes its
        max(1, N // number_of_problems) best rows; the contributions are
        combined, ordered by performance and cut to N rows.

        combined_df -- combined dataframe, passed on to orderByGemmType

        Returns (and stores in self.config) a dict mapping data type to the
        dataframe of selected rows.
        """
        config_dict = {}

        # was orderByGemmType(input_file, ...): `input_file` does not exist here
        type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize)

        type_gemm_dict = orderGemmDict(type_gemm_dict)

        N = self.N

        for dtype, gemm_dfs in type_gemm_dict.items():
            if not gemm_dfs:
                # nothing to take from; also avoids dividing by zero below
                continue

            # every problem contributes at least one row
            subset_size = max(1, N // len(gemm_dfs))

            picked = [
                df.sort_values(by='performance', ascending=False).head(subset_size)
                for df in gemm_dfs.values()
            ]

            type_df = pd.concat(picked).sort_values(by='performance', ascending=False)

            config_dict[dtype] = type_df.head(N)

        self.config = config_dict
        return self.config
'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'performance'] + + + scaler = StandardScaler() + features_scaled = pd.DataFrame(scaler.fit_transform(df[features]), columns=features) + + + features_scaled['performance'] = df['performance'] + + # use silhouette score for optimal clustering + silhouette_scores = [] + upper_limit = min(len(df) // 10, 20) # Adjust 20 or the divisor based on your heuristic + cluster_range = range(2, upper_limit) + for n_clusters in cluster_range: + mb_kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, n_init=10, random_state=42) + cluster_labels = mb_kmeans.fit_predict(features_scaled[features]) + silhouette_avg = silhouette_score(features_scaled[features], cluster_labels) + silhouette_scores.append((n_clusters, silhouette_avg)) + df['cluster'] = mb_kmeans.fit_predict(features_scaled[features]) + + #kmeans = KMeans(n_clusters=n_clusters) + #df['clusters'] = kmeans.fit_predict(features_scaled[features]) + # + #representative_set = df.groupby('cluster').apply(lambda x: x.sample(2)) + #print(representative_set) + + # get optimal clusters + optimal_n = max(silhouette_scores, key=lambda x: x[1])[0] + + # run clustering with optimal n + mb_kmeans = MiniBatchKMeans(n_clusters=optimal_n, batch_size=100, n_init=10, random_state=42) + + # get proper proportion use mAtH + proportion = int(N // optimal_n) + representative_set = df.groupby('cluster').apply(lambda x: x.nlargest(proportion, 'performance')).reset_index(drop=True) + + # Sort each group by 'performance' in descending order and take the top 2 rows + #representative_set = df.groupby('cluster').apply(lambda x: x.nlargest(2, 'performance')).reset_index(drop=True) + #print(representative_set) + + #representative_set = representative_set.drop(['cluster'], axis=1) + + result_dict[k] = representative_set.drop(['cluster'], axis=1) + except Exception as e: + print(f"Error processing type {k}: {e}", file=sys.stderr) + continue + + + self.config = result_dict + return self.config 
+ +class topGemmCluster(quickTunerMethod): + """ + Group each GEMM config, for each cluster take the top performer, + this allows each cluster to have similiarities between GEMMs which + would hopefully contribute to a similar perf config peformance. + """ + + def __init__(self, name=None, N=40, normalize=True): + super().__init__(name, N) + self.normalize = normalize + + def getConfig(self, combined_df): + N = self.N + def calculateProportions(input_list, N=40): + # Count the occurrences of each unique value + count_dict = {} + for item in input_list: + if item in count_dict: + count_dict[item] += 1 + else: + count_dict[item] = 1 + + total_elements = len(input_list) + + raw_percentages = [count_dict[key] / total_elements for key in sorted(count_dict.keys())] + + rounded_percentages = [round(p, 2) for p in raw_percentages] + + total_rounded = sum(rounded_percentages) + difference = round(1 - total_rounded, 2) + + if difference != 0: + max_index = rounded_percentages.index(max(rounded_percentages)) + rounded_percentages[max_index] = round(rounded_percentages[max_index] + difference, 2) + + proportions = [round(p * N) for p in rounded_percentages] + + # Adjust the proportions to ensure the sum is exactly N + total_proportions = sum(proportions) + difference_proportions = N - total_proportions + + if difference_proportions != 0: + max_index = proportions.index(max(proportions)) + proportions[max_index] += difference_proportions + + return proportions + + gemm_df = parseDir(input_file) + + scaler = StandardScaler() + features_scaled = scaler.fit_transform(gemm_df) + + # determine the optimal number of clusters using the elbow method + inertia = [] + for k in range(2, 11): + kmeans = KMeans(n_clusters=k, random_state=0).fit(features_scaled) + inertia.append(kmeans.inertia_) + + + second_derivative = np.diff(np.diff(inertia)) + + # The elbow point is the point with the maximum second derivative + optimal_k = np.argmax(second_derivative) + 2 + + kmeans = 
KMeans(n_clusters=optimal_k, random_state=0).fit(features_scaled) + labels = kmeans.labels_ + + gemm_df['cluster'] = labels + + label_proportions = calculateProportions(labels, N) + + clustered_dfs = {label: gemm_df[gemm_df['cluster'] == label] for label in gemm_df['cluster'].unique()} + + # iterate through the clustered dataframes + label_dict = readDirCluster(input_file, clustered_dfs) + data_dict = {} + for label, type_dict in label_dict.items(): + + for dtype in type_dict: + df = type_dict[dtype] + df = df.sort_values(by='performance',ascending=False) + df = df.head(label_proportions[label]) + if dtype not in data_dict: + data_dict[dtype] = df + else: + data_dict[dtype] = pd.concat([data_dict[dtype], df], ignore_index=True) + + self.config = data_dict + return self.config + +class defaultQuickTune(quickTunerMethod): + """ + take entire set and aggregate the repeats, averaging them out/ weighing them more heavily + """ + def __init__(self, name=None, N=40, normalize=True): + super().__init__(name, N) + self.normalize = normalize + self.N + + def __data2df(self, data): + def split_str(s): + return s.split(':')[-1].split(',') + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + df_dict = {} + for k in data: + for i,n in enumerate(split_str(k)): + col = cols[i] + if col not in df_dict: + df_dict[col] = [] + df_dict[col].append(int(n)) + return pd.DataFrame(df_dict) + + def __get_value(self, data_dict, data_type, perfconfig): + try: + return data_dict[data_type][perfconfig] + except KeyError: + return -1 + + # add averge tflops to wining perf configs + def __add_average_tflops(self, counted_perf,avrg_tflops): + for datatype, value in counted_perf.items(): + for perfconfig, perf_value in value.items(): + avg_value = self.__get_value(avrg_tflops, datatype, perfconfig) + perf_value['tflops'] = avg_value + + # get the perf config with the maximum tflops + def __get_max_tflops_perfconfig(self, group): + max_index = 
group['TFlops'].idxmax() + max_row = group.loc[max_index] + perf_config = max_row['PerfConfig'] + group.drop(max_index, inplace=True) + return perf_config + + def __analyzeData(self, combined_df, avrg_tfops_per_datatype): + tsv_files = pd.DataFrame() + final_df = combined_df + unique_data_types = final_df['DataType'].unique() + # iterate through unique data type + results = {} + operations = ['gemm'] + for data_type in unique_data_types: + win_counts = {} + for operation in operations: + current_df = final_df[final_df['DataType'] == data_type] + problem_cols = [] + # determine the problem columns based on operation type + if operation == "conv": + problem_cols = ['N', 'C', 'K', 'Y', 'X', 'DilationH', 'DilationW', 'StrideH', 'StrideW', 'PaddingH', 'PaddingW'] + elif operation == 'gemm': + problem_cols = ['TransA', 'TransB', 'G', 'M', 'K', 'N'] + else: + raise Exception("Operation not recognized") + grouped = current_df.groupby(problem_cols) + # iterate through the grouped df + for name, group_df in grouped: + avg_value = -1 + max_tflops_perfconfig = {} + # checking if the perf config applies to all tuned perfs + while avg_value == -1: + max_tflops_perfconfig = self.__get_max_tflops_perfconfig(group_df) + avg_value = self.__get_value(avrg_tfops_per_datatype, data_type, max_tflops_perfconfig) + if max_tflops_perfconfig not in win_counts: + win_counts[max_tflops_perfconfig] = {'count': 0, 'tflops': 0} + win_counts[max_tflops_perfconfig]['count'] += 1 + results[data_type] = win_counts + return results + + def __averagePerformance(self, combined_df): + final_df = combined_df + unique_data_types = final_df['DataType'].unique() + result = {} + # iterating through unique data types + for data_type in unique_data_types: + current_df = final_df[final_df['DataType'] == data_type] + fgroups = current_df.groupby('PerfConfig') + not_nan_counts = {} + mean_tflops = {} + problems_count = 0 + # iterating through perf configs in grouped dfs + for perfconfig, group_df in fgroups: + if 
problems_count < len(group_df): + problems_count = len(group_df) + not_nan_count = pd.notna(group_df['TFlops']).sum() + not_nan_counts[perfconfig] = not_nan_count + mean_tflops[perfconfig] = group_df['TFlops'].mean() + sorted_counts = sorted(not_nan_counts.items(), key=lambda x: x[1], reverse=True) + top_perfconfigs = {perfconfig: mean_tflops[perfconfig] for perfconfig, count in sorted_counts if count == problems_count} + result[data_type] = top_perfconfigs + return result + + def getConfig(self, combined_df): + avrg_tfops_per_datatype = self.__averagePerformance(combined_df) + counted_win = self.__analyzeData(combined_df, avrg_tfops_per_datatype) + self.__add_average_tflops(counted_win,avrg_tfops_per_datatype) + sorted_data = {} + for datatype, configs in counted_win.items(): + # sort the configs dictionary by 'count' and 'tflops' + sorted_configs = dict(sorted(configs.items(), key=lambda item: (-item[1]['count'], -item[1]['tflops']))) + sorted_data[datatype] = sorted_configs + + df_dict = {} + for datatype, value in sorted_data.items(): + df_dict[datatype] = self.__data2df(value).head(self.N) + + self.config = df_dict + return df_dict + +class fairSelect(quickTunerMethod): + """ + take entire set and aggregate the repeats, averaging them out/ weighing them more heavily + """ + def __init__(self, name=None, N=40, normalize=True): + super().__init__(name, N) + self.normalize = normalize + + def __get_top_90_percent(self, df): + df_sorted = df.sort_values(by='performance', ascending=False) + return df_sorted[df_sorted['performance'] >= 0.95] + + def __combine_datasets(self, dfs): + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + combined_df = pd.concat(dfs).sort_values(by='performance', ascending=False) + combined_df = combined_df.drop_duplicates(subset=cols, keep='first') + return combined_df + + def __aggregate_datasets(self, dfs): + feature_dict = defaultdict(list) + count_dict = defaultdict(int) + max_label_dict = {} + 
df_dict = {} # from id to dataframe + + for df in dfs: + df_id = id(df) + df_dict[df_id] = df + for _, row in df.iterrows(): + feature_vector = tuple(row[:-1]) # get feature (all but performance) + label = row[-1] + feature_dict[feature_vector].append(df_id) + count_dict[feature_vector] += 1 + if feature_vector not in max_label_dict or label > max_label_dict[feature_vector]: + max_label_dict[feature_vector] = label + + return feature_dict, count_dict, max_label_dict, df_dict + + def __balance_datasets(self, combined_df, original_dfs): + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + selected_features = set() + balanced_dataset = [] + + for i in range(len(original_dfs)): + if len(balanced_dataset) >= 40: + break + df = original_dfs[i] + + for _, row in df.iterrows(): + feature_tuple = tuple(row[cols]) + + if feature_tuple not in selected_features: + selected_features.add(feature_tuple) + balanced_dataset.append(feature_tuple) + break + + for _, row in combined_df.iterrows(): + if len(balanced_dataset) >= 30: + break + feature_tuple = tuple(row[cols]) + + if feature_tuple not in selected_features: + selected_features.add(feature_tuple) + balanced_dataset.append(row) + + balanced_dataset_df = pd.DataFrame(balanced_dataset, columns=cols) + + return balanced_dataset_df + + def __build_final_df(self, top_dfs): + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + # aggregate common feature vectors + feature_dict, count_dict, max_label_dict, df_dict = self.__aggregate_datasets(top_dfs) + highest_perfs = self.__combine_datasets(top_dfs) + + # sort features by count and max label + sorted_features = sorted(count_dict.keys(), key=lambda x: (-count_dict[x], -max_label_dict[x])) + + # int final dataset and keep track of added features + final_dataset = [] + added_features = set() + used_dfs = set() + + + for feature in sorted_features: + if feature not in added_features: + # find the dataframes 
containing this feature + containing_dfs = feature_dict[feature] + if not any(df_id in used_dfs for df_id in containing_dfs): + # add the feature with its maximum label + final_dataset.append(feature) + added_features.add(feature) + # mark the dataframes as used + for df_id in containing_dfs: + used_dfs.add(df_id) + # used all labels + if len(used_dfs) == len(top_dfs): + break + top = set([id(df) for df in top_dfs]) + used = set() + for d in final_dataset: + for did in feature_dict[d]: + used.add(did) + + diff = top.difference(used) + for df_id in diff: + df = df_dict[df_id] + for _, row in df.iterrows(): + feature = tuple(row[:-1]) + if feature not in added_features: + added_features.add(feature) + final_dataset.append(feature) + break + + if len(final_dataset) < self.N: + for _, row in highest_perfs.iterrows(): + feature = tuple(row[:-1]) + if feature not in added_features: + added_features.add(feature) + final_dataset.append(feature) + if len(final_dataset) >= self.N: + break + # add more high performers + + + return pd.DataFrame(final_dataset, columns=cols).head(self.N) # though this should really not be set + + def getConfig(self, combined_df): + config_dict = {} + + type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize) + + type_gemm_dict = orderGemmDict(type_gemm_dict) + + for dtype, dfs in type_gemm_dict.items(): + + top_90_percent = [] + for cfg in dfs: + df = dfs[cfg] + top_90_percent.append(self.__get_top_90_percent(df)) + + config_dict[dtype] = self.__build_final_df(top_90_percent) + + self.config = config_dict + return config_dict + + + +def main(args=None): + if args is None: + args = sys.argv[1:] + + parser = argparse.ArgumentParser(prog='clusterConfigs.py', + description='Bunch together runs into a parallel dir named ../{DIR_NAME}-bunch') + + parser.add_argument('--input-file', + required=True, + type=str) + + parser.add_argument('--method', + nargs='+', + 
choices=["default","topNSelect","topMode","takeNEach","fairSelect","hardcoded"], + default=["default","fairSelect"], + help='Select perfConfig gen selection method') + + parser.add_argument('--save', + action='store_true', + default=False, + help='Save configs to name.dtype.qt') + + parser.add_argument('--debug', + action='store_true', + default=False, + help='Print debug info, print config files to stdout') + + parser.add_argument('--num', '-n', + type=int, + default=40, + help='Number of perf configs to include') + + + + pargs = parser.parse_args() + + tuner = quickTuner(pargs) + + tuner.tune() + + if pargs.save: + tuner.saveConfigs() + + if pargs.debug: + tuner.printConfigs() + + + +if __name__ == '__main__': + main(sys.argv[1:]) + From 7e040c9b9243b96a4354c6136be43ed7b28d8cd5 Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 18 Jul 2024 15:46:09 +0000 Subject: [PATCH 02/14] Added directory path saving and pef config formatting --- mlir/utils/performance/quickTunerGen.py | 48 ++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index 76d38d0a142c..0739088ce4a4 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -5,14 +5,13 @@ as input. 
Needs the input to be a combined normalized dataframe (default from quickTunerPreproc.py) -Usage: clusterConfigs.py [-h] --input-file INPUT_FILE [--method {default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} [{default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} ...]] [--save] [--debug] [--num NUM] +Usage: clusterConfigs.py [-h] --input-file INPUT_FILE [--method {default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} [{default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} ...]] [--save] [--debug] [--num NUM] [--perfconfig--format] Example Usage: python3 quickTunerGen.py --input-file TESTFILE.out --method fairSelect --save --debug --num 20 -Will read TESTFILE.out then generate a quick tune list of length 20 for each datatype in TESTFILE.out. Will -both print these lists and save them to METHODNAME.DTYPE.qt. +Will read TESTFILE.out then generate a quick tune list of length 20 for each datatype in TESTFILE.out. Will print these lists and save them to METHODNAME.DTYPE.qt. """ import os @@ -51,7 +50,14 @@ def __init__(self, name=None, N=40): self.name = self.__class__.__name__ else: self.name = name - + + def __perfconfig_formatter(self, df, prefix="v2:"): + """ + Add prefix to first column and remove header + """ + df.iloc[:, 0] = prefix + df.iloc[:, 0].astype(str) + return df + def setN(self, N): """ @@ -59,7 +65,7 @@ def setN(self, N): """ self.N = N - def saveQt(self, name=None, debug=False, suffix=".qt"): + def saveQt(self, name=None, directory=None, debug=False, suffix=".qt", pf_format=False): """ Function to convert a type dictionary config to a .qt file Converts the list of quickTuning sets into a group of files @@ -72,10 +78,17 @@ def saveQt(self, name=None, debug=False, suffix=".qt"): printConfigDict(type_df) for t in type_df: fname = name + "." 
+ t + suffix + if directory: + fname = os.path.join(directory, fname) df = type_df[t] if 'performance' in df.columns: df = df.drop(labels=['performance'], axis=1) - df = df.to_csv(fname, index=False) + df = df.astype(int) + header = True + if pf_format: + df = self.__perfconfig_formatter(df) + header = False + df = df.to_csv(fname, index=False, header=header) def savePerfConfig(self, name=None, dtype=None, prefix="v2:"): """ @@ -119,6 +132,7 @@ class quickTuner(object): def __init__(self, pargs): self.methods = {} self.N = pargs.num + self.directory = pargs.directory """ maybe something like if self.input_dir: @@ -216,13 +230,13 @@ def validate(self): print(self.output_df) """ - def saveConfigs(self, debug=False): + def saveConfigs(self, debug=False, pf_format=False): """ Iterate through methods and save to each file """ for k in self.methods: method = self.methods[k] - method.saveQt() + method.saveQt(pf_format=pf_format, directory=self.directory) def printConfigs(self): """ @@ -232,7 +246,10 @@ def printConfigs(self): raise ValueError("Method results not generated") for k in self.method_results: for dtype in self.method_results[k]: - print(f"dtype: {dtype}\n{self.method_results[k][dtype]}\n") + df = self.method_results[k][dtype] + df = df.astype(int) + print(f"dtype: {dtype}\n{df}\n") + def saveBest(self): """ @@ -311,7 +328,7 @@ def parseData(file): tile_params.columns = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] - tile_params = tile_params.drop(['param8','param9'], axis=1) + #tile_params = tile_params.drop(['param8','param9'], axis=1) tile_params['performance'] = data['performance'] @@ -1124,7 +1141,14 @@ def main(args=None): default=40, help='Number of perf configs to include') - + parser.add_argument('--perfconfig-format', + action='store_true', + default=False, + help='Save file in correct csv perfconfig format') + + parser.add_argument('--directory', + type=str, + help='Directory to store results to') 
pargs = parser.parse_args() @@ -1133,7 +1157,7 @@ def main(args=None): tuner.tune() if pargs.save: - tuner.saveConfigs() + tuner.saveConfigs(pf_format=pargs.perfconfig_format) if pargs.debug: tuner.printConfigs() From d3d6a0a2a8f4a7a1e79c73f855f8d79513b6200a Mon Sep 17 00:00:00 2001 From: Ethan Date: Fri, 19 Jul 2024 23:48:06 +0000 Subject: [PATCH 03/14] Fixed threshold values, need to add kwargs for methods. --- mlir/utils/performance/quickTunerGen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index 0739088ce4a4..0aa2008662f4 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -966,13 +966,14 @@ class fairSelect(quickTunerMethod): """ take entire set and aggregate the repeats, averaging them out/ weighing them more heavily """ - def __init__(self, name=None, N=40, normalize=True): + def __init__(self, name=None, N=40, normalize=True, threshold=0.95): super().__init__(name, N) self.normalize = normalize + self.threshold # top 95 percent for efficiency def __get_top_90_percent(self, df): df_sorted = df.sort_values(by='performance', ascending=False) - return df_sorted[df_sorted['performance'] >= 0.95] + return df_sorted[df_sorted['performance'] >= self.threshold] def __combine_datasets(self, dfs): cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] From 81e292ab4670a98d9997a605b4e31e84efee39ff Mon Sep 17 00:00:00 2001 From: Ethan Date: Sat, 20 Jul 2024 16:41:50 +0000 Subject: [PATCH 04/14] Fixed threshold not being assigned --- mlir/utils/performance/quickTunerGen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index 0aa2008662f4..b0bfea367f05 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -969,7 +969,7 @@ class 
fairSelect(quickTunerMethod): def __init__(self, name=None, N=40, normalize=True, threshold=0.95): super().__init__(name, N) self.normalize = normalize - self.threshold # top 95 percent for efficiency + self.threshold = threshold # top 95 percent for efficiency def __get_top_90_percent(self, df): df_sorted = df.sort_values(by='performance', ascending=False) From e05b6580f9dd19bce9d8b184cfce86c3664766cc Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 22 Jul 2024 15:16:59 +0000 Subject: [PATCH 05/14] Changed indexing for input dataframe to be 'NormalizedTFlops' instead of 'TFlops' --- mlir/utils/performance/quickTunerGen.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index b0bfea367f05..ecd56d2dc2b6 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -450,7 +450,7 @@ def orderByType(combined_df: str, normalize=False): perf_configs.drop(['param8', 'param9'], axis=1, inplace=True) - perf_configs['performance'] = final_df['TFlops'] + perf_configs['performance'] = final_df['NormalizedTFlops'] perf_configs['DataType'] = final_df['DataType'] @@ -484,7 +484,7 @@ def orderByGemmType(combined_df: str, normalize=True): perf_configs.drop(['param8', 'param9'], axis=1, inplace=True) - perf_configs['performance'] = final_df['TFlops'] + perf_configs['performance'] = final_df['NormalizedTFlops'] perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) @@ -882,7 +882,7 @@ def __add_average_tflops(self, counted_perf,avrg_tflops): # get the perf config with the maximum tflops def __get_max_tflops_perfconfig(self, group): - max_index = group['TFlops'].idxmax() + max_index = group['NormalizedTFlops'].idxmax() max_row = group.loc[max_index] perf_config = max_row['PerfConfig'] group.drop(max_index, inplace=True) @@ -937,9 +937,9 @@ def __averagePerformance(self, combined_df): for perfconfig, group_df in fgroups: 
if problems_count < len(group_df): problems_count = len(group_df) - not_nan_count = pd.notna(group_df['TFlops']).sum() + not_nan_count = pd.notna(group_df['NormalizedTFlops']).sum() not_nan_counts[perfconfig] = not_nan_count - mean_tflops[perfconfig] = group_df['TFlops'].mean() + mean_tflops[perfconfig] = group_df['NormalizedTFlops'].mean() sorted_counts = sorted(not_nan_counts.items(), key=lambda x: x[1], reverse=True) top_perfconfigs = {perfconfig: mean_tflops[perfconfig] for perfconfig, count in sorted_counts if count == problems_count} result[data_type] = top_perfconfigs From 9d4fddf67d524611b19e1cc13a776cdb3a9f8489 Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 22 Jul 2024 17:06:20 +0000 Subject: [PATCH 06/14] Added documentation for classes and cleaned up uncessary comments/ lines of code. --- mlir/utils/performance/quickTunerGen.py | 137 +++++++----------------- 1 file changed, 41 insertions(+), 96 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index ecd56d2dc2b6..a9dd0e743cc3 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -92,6 +92,8 @@ def saveQt(self, name=None, directory=None, debug=False, suffix=".qt", pf_format def savePerfConfig(self, name=None, dtype=None, prefix="v2:"): """ + Saves perf configuration in the 'standard' format that can be read + and accepted into other scripts or as an arg in rocmlir-gen """ type_df = self.config @@ -176,13 +178,6 @@ def __parseValidationArgs(self, pargs): else: raise ValueError(f"Argument {item} is not a valid key=value pair") - # leftovers from pargs.validate - #if pargs.validate and pargs.validate == 'data': - # init validator - # self.validator = dataValidator(pargs.input_file,**kwargs) - #else: - # self.validator = None - def addMethod(self, method: quickTunerMethod): """ Adds method to method dict @@ -190,6 +185,10 @@ def addMethod(self, method: quickTunerMethod): self.methods[method.name] = 
method def tune(self): + """ + tuner function that actually does the work of calling each registered + quickTunerMethod class's getConfig method to generated perf configs + """ self.method_results = {} if not self.methods: print("No methods are registered, use quickTuner.addMethod(method: quickTunerMethod), to add a method", file=sys.stderr) @@ -199,36 +198,6 @@ def tune(self): method = self.methods[k] df = method.getConfig(self.combined_df.copy()) self.method_results[k] = df - - """ moved to quickTunerStat.py - def validate(self): - #Validate on either a dataset or by running rocmlir-tuning-gen - if self.validator is None: - print("validator not set", file=sys.stderr) - return - output_dict = {} - for method in self.method_results: - # df will be of the form: {type1: [data], type2: [data], ..., typeN: [data]} - for dtype in self.method_results[method]: - if dtype not in output_dict: - output_dict[dtype] = {} - gemm_data = self.validator.validate(self.method_results[method][dtype], dtype) - - - for df in gemm_data: # for every gemm config we get data back - ct = 0 - max_values = [] - threshold = 0.92 - for df in gemm_data: - if (df['performance'].dropna() <= threshold).all(): - #print(f"{name} does not meet threshold (>0.8): {df}") - ct += 1 - #max_values.append(df[column].max()) - output_dict[dtype][method] = ct - - self.output_df = pd.DataFrame(output_dict) - print(self.output_df) - """ def saveConfigs(self, debug=False, pf_format=False): """ @@ -248,38 +217,7 @@ def printConfigs(self): for dtype in self.method_results[k]: df = self.method_results[k][dtype] df = df.astype(int) - print(f"dtype: {dtype}\n{df}\n") - - - def saveBest(self): - """ - Save the best method - """ - df = self.output_df - - min_values = df.min() - best_methods = df.idxmin() - - method_counts = best_methods.value_counts() - - max_count = method_counts.max() - majority_methods = method_counts[method_counts == max_count].index - - result_methods = {} - for col in df.columns: - candidates = 
df.loc[majority_methods, col] - result_methods[col] = candidates.idxmin() - - # Create a list of tuples with index and corresponding method - output = [(index, method) for index, method in result_methods.items()] - - for entry in output: - dtype, method = entry - self.methods[method].savePerfConfig(f"quick_tuning_{dtype}", dtype) - - - - + print(f"dtype: {dtype}\n{df}\n") """ Common methods @@ -289,7 +227,6 @@ def orderDict(type_dict: dict): """ order dictionary, removing nan along the way """ - for k,v in type_dict.items(): df = type_dict[k] @@ -519,7 +456,9 @@ def convertToConfig(type_df, filename, suffix=".qt", debug=False): class hardcodeQuickTune(quickTunerMethod): """ - Default quick tune method, uses preset values for the config file + Default quick tune method, uses preset values for the config + file as of 07/22/24 these are the values that are coded into + GridwiseGemmParams.cpp """ def __init__(self, name=None): super().__init__(name) @@ -563,9 +502,7 @@ def getConfig(self, input_file): return self.config """ - -Place derived quickTunerMethod classes below here: - +Place child quickTunerMethod classes below here: """ class topNSelection(quickTunerMethod): @@ -596,8 +533,10 @@ def getConfig(self, combined_df): class topMode(quickTunerMethod): """ - get most common of all gemms + Count occurrences of each perf config, take top most common + perf configs """ + def __init__(self, name=None, N=40, normalize=True): super().__init__(name, N) self.normalize = normalize @@ -636,14 +575,15 @@ def getConfig(self, input_file): class takeNEach(quickTunerMethod): + """ + take top performers from N dataframes + """ + def __init__(self, name=None, N=40, normalize=True): super().__init__(name, N) self.normalize = normalize def getConfig(self, combined_df): - """ - take top performers from N dataframes - """ config_dict = {} type_gemm_dict = orderByGemmType(input_file, normalize=self.normalize) @@ -680,9 +620,12 @@ def getConfig(self, combined_df): class 
topConfigCluster(quickTunerMethod): """ - Cluster each run, take sample from total + Cluster each run, take sample from total, + can be improved via new distance metric + for Kmeans clustering, alternatively + try with DBSCAN again """ - + def __init__(self, name=None, N=40, normalize=True): super().__init__(name, N) self.normalize = normalize @@ -724,12 +667,6 @@ def getConfig(self, combined_df): silhouette_scores.append((n_clusters, silhouette_avg)) df['cluster'] = mb_kmeans.fit_predict(features_scaled[features]) - #kmeans = KMeans(n_clusters=n_clusters) - #df['clusters'] = kmeans.fit_predict(features_scaled[features]) - # - #representative_set = df.groupby('cluster').apply(lambda x: x.sample(2)) - #print(representative_set) - # get optimal clusters optimal_n = max(silhouette_scores, key=lambda x: x[1])[0] @@ -740,12 +677,6 @@ def getConfig(self, combined_df): proportion = int(N // optimal_n) representative_set = df.groupby('cluster').apply(lambda x: x.nlargest(proportion, 'performance')).reset_index(drop=True) - # Sort each group by 'performance' in descending order and take the top 2 rows - #representative_set = df.groupby('cluster').apply(lambda x: x.nlargest(2, 'performance')).reset_index(drop=True) - #print(representative_set) - - #representative_set = representative_set.drop(['cluster'], axis=1) - result_dict[k] = representative_set.drop(['cluster'], axis=1) except Exception as e: print(f"Error processing type {k}: {e}", file=sys.stderr) @@ -964,8 +895,24 @@ def getConfig(self, combined_df): class fairSelect(quickTunerMethod): """ - take entire set and aggregate the repeats, averaging them out/ weighing them more heavily + take entire set and aggregate the repeats, averaging them out/ weighing them more heavily. 
+ Breakdown of steps: + 1) for type and each gemm + 2) get the top 90% of each list, found by using min-max scalar + 3) sort the features by count (occurrences) and performance + 4) iterate over sorted feature and for each feature: + check if it has been added to the final dataset + if not find the dataframes that contain this feature + if none have been used add the feature to the final + dataset, mark feature as added, mark df as used. + if all dataframe have been used, break + 5) if any dataframes have not been represented yet, add top + performeres from each dataframe until all are represented + 6) fill any remaining performers until required size is met + or + cut down spacce (df.head(N)) """ + def __init__(self, name=None, N=40, normalize=True, threshold=0.95): super().__init__(name, N) self.normalize = normalize @@ -1162,9 +1109,7 @@ def main(args=None): if pargs.debug: tuner.printConfigs() - - - + if __name__ == '__main__': main(sys.argv[1:]) From c55384151e500edf207fb885f6e200f7bbb519cb Mon Sep 17 00:00:00 2001 From: Ethan Date: Tue, 30 Jul 2024 18:42:04 +0000 Subject: [PATCH 07/14] Added some changes to column names to include splitK and bCopyMore. 
--- mlir/utils/performance/quickTunerGen.py | 44 ++++++++++++------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index a9dd0e743cc3..1e8ffbd21dae 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -262,11 +262,8 @@ def parseData(file): tile_params = data['perf_config'].str.split(',', expand=True).astype(int) - tile_params.columns = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + tile_params.columns = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] - - #tile_params = tile_params.drop(['param8','param9'], axis=1) - tile_params['performance'] = data['performance'] tile_params.replace('N/A', np.nan, inplace=True) @@ -379,14 +376,12 @@ def orderByType(combined_df: str, normalize=False): final_df = combined_df unique_data_types = final_df['DataType'].unique() - perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) perf_configs.columns = perf_config_cols - perf_configs.drop(['param8', 'param9'], axis=1, inplace=True) - perf_configs['performance'] = final_df['NormalizedTFlops'] perf_configs['DataType'] = final_df['DataType'] @@ -413,14 +408,12 @@ def orderByGemmType(combined_df: str, normalize=True): target_cols = trans_cols + param_cols - perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] perf_configs = 
final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) perf_configs.columns = perf_config_cols - perf_configs.drop(['param8', 'param9'], axis=1, inplace=True) - perf_configs['performance'] = final_df['NormalizedTFlops'] perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) @@ -447,7 +440,6 @@ def convertToConfig(type_df, filename, suffix=".qt", debug=False): df = type_df[t] if 'performance' in df.columns: df = df.drop(labels=['performance'], axis=1) - df['forceUnroll'] = 1 df = df.to_csv(fname, index=False) """ @@ -469,7 +461,9 @@ def __init__(self, name=None): "M/wave": [128, 128, 64, 128, 32, 64, 32, 32, 32, 128, 128, 32, 64, 64, 64, 32, 32, 32, 16, 32, 16, 32, 16, 64, 32, 16, 16, 16, 32, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16], "N/wave": [32, 32, 16, 32, 32, 16, 32, 16, 32, 32, 16, 16, 16, 32, 16, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], "kPack": [4, 1, 4, 4, 8, 1, 4, 1, 4, 4, 4, 8, 4, 1, 4, 4, 8, 4, 4, 4, 8, 4, 8, 8, 8, 4, 4, 8, 1, 4, 4, 4, 8, 4, 8, 8, 4, 8, 4, 8], - "forceUnroll": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "splitK": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "forceUnroll": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], + "bCopyMore": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True] }) @@ -480,7 +474,9 @@ def __init__(self, name=None): "M/wave": [64, 64, 128, 64, 32, 32, 128, 128, 64, 64, 32, 128, 
32, 32, 64, 32, 32, 32, 64, 32, 32, 32, 32, 32, 16, 32, 32, 32, 32, 16, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16], "N/wave": [32, 32, 32, 32, 32, 16, 32, 16, 32, 16, 32, 16, 32, 32, 16, 32, 16, 16, 32, 16, 32, 32, 32, 16, 16, 32, 32, 32, 16, 16, 32, 32, 16, 32, 16, 16, 16, 16, 16, 16], "kPack": [4, 8, 8, 4, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 8, 8, 8, 8, 8, 4], - "forceUnroll": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "splitK": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "forceUnroll": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], + "bCopyMore": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True] }) self.default_i8 = pd.DataFrame({ @@ -490,7 +486,9 @@ def __init__(self, name=None): "M/wave": [128, 64, 128, 64, 32, 64, 32, 32, 32, 64, 32, 64, 32, 32, 32, 32, 32, 32, 32, 32, 16, 32, 16, 32, 32, 16, 32, 32, 32, 16, 32, 16, 32, 16, 16, 16, 16, 16, 16, 16], "N/wave": [16, 32, 16, 16, 16, 32, 32, 16, 16, 32, 16, 16, 16, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], "kPack": [4, 8, 8, 8, 16, 4, 16, 16, 16, 4, 4, 8, 16, 8, 4, 16, 16, 16, 8, 4, 16, 4, 16, 16, 8, 16, 4, 8, 4, 4, 4, 16, 8, 4, 8, 8, 4, 16, 4, 4], - "forceUnroll": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "splitK": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "forceUnroll": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], + "bCopyMore": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True] }) self.config = { 'f32': self.default_f32, 'f16': self.default_f16, 'i8': self.default_i8 } @@ -560,11 +558,11 @@ def getConfig(self, input_file): # now we have a list of the gemms in combined # remove any repetitions and order by appearance - grouped_df = df.groupby(['M/block','N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'], as_index=False).agg({'performance': 'count'}).rename(columns={'performance': 'count'}) + grouped_df = df.groupby(['M/block','N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'], as_index=False).agg({'performance': 'count'}).rename(columns={'performance': 'count'}) - result_df = pd.merge(df, grouped_df, on=['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll']) + result_df = pd.merge(df, grouped_df, on=['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore']) - final_df = result_df.loc[result_df.groupby(['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'])['performance'].idxmax()] + final_df = result_df.loc[result_df.groupby(['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'])['performance'].idxmax()] final_df = final_df.sort_values(by=['count', 'performance'], ascending=[False, False]) @@ -639,7 +637,7 @@ def getConfig(self, combined_df): result_dict = {} - features = ['M/block', 'N/block', 
'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + features = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] # now we have normalized data for k,df in type_dict.items(): @@ -647,7 +645,7 @@ def getConfig(self, combined_df): # cluster each type - features = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'performance'] + features = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore', 'performance'] scaler = StandardScaler() @@ -788,7 +786,7 @@ def __init__(self, name=None, N=40, normalize=True): def __data2df(self, data): def split_str(s): return s.split(':')[-1].split(',') - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll', 'param8', 'param9'] + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] df_dict = {} for k in data: for i,n in enumerate(split_str(k)): @@ -923,7 +921,7 @@ def __get_top_90_percent(self, df): return df_sorted[df_sorted['performance'] >= self.threshold] def __combine_datasets(self, dfs): - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] combined_df = pd.concat(dfs).sort_values(by='performance', ascending=False) combined_df = combined_df.drop_duplicates(subset=cols, keep='first') return combined_df @@ -948,7 +946,7 @@ def __aggregate_datasets(self, dfs): return feature_dict, count_dict, max_label_dict, df_dict def __balance_datasets(self, combined_df, original_dfs): - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] selected_features = set() balanced_dataset = [] @@ -979,7 +977,7 @@ def __balance_datasets(self, combined_df, 
original_dfs): return balanced_dataset_df def __build_final_df(self, top_dfs): - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'forceUnroll'] + cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] # aggregate common feature vectors feature_dict, count_dict, max_label_dict, df_dict = self.__aggregate_datasets(top_dfs) highest_perfs = self.__combine_datasets(top_dfs) From c7e8eaf268449fc0ea52162a04da1e4188b4ac2b Mon Sep 17 00:00:00 2001 From: Ethan Date: Sun, 4 Aug 2024 22:17:59 +0000 Subject: [PATCH 08/14] Added faulthandler and removed seaborn --- mlir/utils/performance/quickTunerGen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index 1e8ffbd21dae..a104617deabe 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -33,11 +33,10 @@ from sklearn.metrics import silhouette_score from collections import defaultdict import matplotlib.pyplot as plt -import seaborn as sns - import faulthandler import re import glob +import traceback class quickTunerMethod(object): """ From 88665fae8c9c1f6e7af11af0b1ad0027426552a4 Mon Sep 17 00:00:00 2001 From: Ethan Date: Sun, 4 Aug 2024 22:26:53 +0000 Subject: [PATCH 09/14] Removed hard coded model --- mlir/utils/performance/quickTunerGen.py | 59 +------------------------ 1 file changed, 1 insertion(+), 58 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index a104617deabe..54279aa5cb28 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -441,63 +441,6 @@ def convertToConfig(type_df, filename, suffix=".qt", debug=False): df = df.drop(labels=['performance'], axis=1) df = df.to_csv(fname, index=False) -""" -Hardcoded tuner method -""" - -class hardcodeQuickTune(quickTunerMethod): - """ - Default quick 
tune method, uses preset values for the config - file as of 07/22/24 these are the values that are coded into - GridwiseGemmParams.cpp - """ - def __init__(self, name=None): - super().__init__(name) - self.default_f32 = pd.DataFrame({ - "M/block": [256, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16], - "N/block": [256, 64, 128, 128, 128, 64, 64, 64, 64, 64, 32, 16, 256, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 16, 128, 128, 64, 64, 32, 32, 16, 16, 32, 32, 16, 16], - "K/block": [2, 8, 8, 4, 2, 8, 8, 8, 4, 2, 4, 4, 8, 4, 4, 4, 2, 8, 8, 8, 8, 4, 4, 8, 4, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 4, 8, 4, 8], - "M/wave": [128, 128, 64, 128, 32, 64, 32, 32, 32, 128, 128, 32, 64, 64, 64, 32, 32, 32, 16, 32, 16, 32, 16, 64, 32, 16, 16, 16, 32, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16], - "N/wave": [32, 32, 16, 32, 32, 16, 32, 16, 32, 32, 16, 16, 16, 32, 16, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], - "kPack": [4, 1, 4, 4, 8, 1, 4, 1, 4, 4, 4, 8, 4, 1, 4, 4, 8, 4, 4, 4, 8, 4, 8, 8, 8, 4, 4, 8, 1, 4, 4, 4, 8, 4, 8, 8, 4, 8, 4, 8], - "splitK": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "forceUnroll": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], - "bCopyMore": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True] - }) - - - self.default_f16 = pd.DataFrame({ - "M/block": [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 
128, 128, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16], - "N/block": [256, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 32, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 32, 32, 16, 128, 64, 64, 32, 32, 16, 128, 32, 64, 32], - "K/block": [8, 4, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 8, 4, 8, 8, 8, 8, 4, 4, 8, 8, 8, 8, 4, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8], - "M/wave": [64, 64, 128, 64, 32, 32, 128, 128, 64, 64, 32, 128, 32, 32, 64, 32, 32, 32, 64, 32, 32, 32, 32, 32, 16, 32, 32, 32, 32, 16, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16], - "N/wave": [32, 32, 32, 32, 32, 16, 32, 16, 32, 16, 32, 16, 32, 32, 16, 32, 16, 16, 32, 16, 32, 32, 32, 16, 16, 32, 32, 32, 16, 16, 32, 32, 16, 32, 16, 16, 16, 16, 16, 16], - "kPack": [4, 8, 8, 4, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 8, 8, 8, 8, 8, 4], - "splitK": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "forceUnroll": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], - "bCopyMore": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True] - }) - - self.default_i8 = pd.DataFrame({ - "M/block": [128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16], - "N/block": [256, 128, 128, 128, 128, 64, 64, 64, 64, 128, 128, 128, 128, 128, 64, 64, 64, 64, 64, 64, 64, 32, 32, 32, 32, 16, 256, 256, 128, 64, 64, 64, 64, 32, 32, 16, 64, 32, 16, 16], - 
"K/block": [8, 16, 8, 8, 8, 32, 8, 8, 4, 32, 16, 8, 4, 8, 16, 8, 8, 4, 4, 16, 16, 16, 8, 8, 8, 8, 16, 4, 32, 32, 16, 8, 4, 32, 16, 16, 16, 16, 32, 16], - "M/wave": [128, 64, 128, 64, 32, 64, 32, 32, 32, 64, 32, 64, 32, 32, 32, 32, 32, 32, 32, 32, 16, 32, 16, 32, 32, 16, 32, 32, 32, 16, 32, 16, 32, 16, 16, 16, 16, 16, 16, 16], - "N/wave": [16, 32, 16, 16, 16, 32, 32, 16, 16, 32, 16, 16, 16, 16, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], - "kPack": [4, 8, 8, 8, 16, 4, 16, 16, 16, 4, 4, 8, 16, 8, 4, 16, 16, 16, 8, 4, 16, 4, 16, 16, 8, 16, 4, 8, 4, 4, 4, 16, 8, 4, 8, 8, 4, 16, 4, 4], - "splitK": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "forceUnroll": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], - "bCopyMore": [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True] - }) - - self.config = { 'f32': self.default_f32, 'f16': self.default_f16, 'i8': self.default_i8 } - - def getConfig(self, input_file): - """ - returns the already made config - """ - return self.config - """ Place child quickTunerMethod classes below here: """ @@ -1067,7 +1010,7 @@ def main(args=None): parser.add_argument('--method', nargs='+', - choices=["default","topNSelect","topMode","takeNEach","fairSelect","hardcoded"], + choices=["default","topNSelect","topMode","takeNEach","fairSelect"], default=["default","fairSelect"], help='Select perfConfig gen selection method') From b246a6de74cdb05ff91b5c97b654f8e74eb45f8e Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 5 Aug 2024 16:30:42 
+0000 Subject: [PATCH 10/14] Updated fairSelect to use PerfConfig cols instead of splitting --- mlir/utils/performance/quickTunerGen.py | 89 ++++++++++--------------- 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index 54279aa5cb28..1eac6b182d30 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -82,12 +82,12 @@ def saveQt(self, name=None, directory=None, debug=False, suffix=".qt", pf_format df = type_df[t] if 'performance' in df.columns: df = df.drop(labels=['performance'], axis=1) - df = df.astype(int) + #df = df.astype(int) header = True if pf_format: df = self.__perfconfig_formatter(df) header = False - df = df.to_csv(fname, index=False, header=header) + df = df.to_csv(fname, index=False) def savePerfConfig(self, name=None, dtype=None, prefix="v2:"): """ @@ -215,7 +215,7 @@ def printConfigs(self): for k in self.method_results: for dtype in self.method_results[k]: df = self.method_results[k][dtype] - df = df.astype(int) + #df = df.astype(int) print(f"dtype: {dtype}\n{df}\n") """ @@ -240,9 +240,9 @@ def orderGemmDict(type_gemm_dict: dict): for k, v in type_gemm_dict.items(): for sub_dict in v: df = v[sub_dict] - df = df.dropna(how='any') - - type_gemm_dict[k][sub_dict] = df.sort_values(by=['performance'], ascending=False, ignore_index=True) + df = df.dropna(subset='performance', how='any') + df = df.sort_values(by=['performance'], ascending=False, ignore_index=True) + type_gemm_dict[k][sub_dict] = df return type_gemm_dict @@ -340,36 +340,6 @@ def parseDir(input_file: str, normalize=True): return group_df -def parseDir2(input_file: str, normalize=True): - - df_dir = {} - - tsv_files = glob.glob(f"{input_file}/*.debug") - - for file in tsv_files: - df = pd.read_csv(file, sep='\t') - if normalize: - scaler = MinMaxScaler() - df['TFlops'] = scaler.fit_transform(df[['TFlops']]) - dfs.append(df) - - final_df = 
pd.concat(dfs, ignore_index=True) - - trans_cols = ['TransA', 'TransB'] - - param_cols = [ 'G', 'M', 'N','K'] - - final_df = final_df.astype({entry: bool for entry in trans_cols}) - - final_df = final_df.astype({entry: int for entry in param_cols}) - - target_cols = trans_cols + param_cols - - group_df = {dtype: df for dtype, df in final_df[target_cols].groupby('DataType')} - - return group_df - - def orderByType(combined_df: str, normalize=False): final_df = combined_df @@ -407,21 +377,25 @@ def orderByGemmType(combined_df: str, normalize=True): target_cols = trans_cols + param_cols - perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] + final_df['performance'] = final_df['NormalizedTFlops'] - perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) + #perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] - perf_configs.columns = perf_config_cols + #perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) - perf_configs['performance'] = final_df['NormalizedTFlops'] + #perf_configs.columns = perf_config_cols + + #perf_configs['performance'] = final_df['NormalizedTFlops'] - perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) + #perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) - grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in perf_configs.groupby(['DataType'])} + grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in final_df.groupby(['DataType'])} + print(grouped.keys()) for k in grouped: group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(target_cols)} grouped[k] = group + print(group.keys()) return grouped @@ -865,7 +839,7 @@ def __get_top_90_percent(self, df): def __combine_datasets(self, dfs): cols = ['M/block', 'N/block', 'K/block', 'M/wave', 
'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] combined_df = pd.concat(dfs).sort_values(by='performance', ascending=False) - combined_df = combined_df.drop_duplicates(subset=cols, keep='first') + combined_df = combined_df.drop_duplicates(subset='PerfConfig', keep='first') return combined_df def __aggregate_datasets(self, dfs): @@ -878,8 +852,13 @@ def __aggregate_datasets(self, dfs): df_id = id(df) df_dict[df_id] = df for _, row in df.iterrows(): - feature_vector = tuple(row[:-1]) # get feature (all but performance) - label = row[-1] + print(row) + #feature_vector = tuple(row[:-1]) # get feature (all but performance) + feature_vector = row['PerfConfig'] + print(feature_vector) + print(row['PerfConfig']) + #exit(1) + label = row['performance'] feature_dict[feature_vector].append(df_id) count_dict[feature_vector] += 1 if feature_vector not in max_label_dict or label > max_label_dict[feature_vector]: @@ -919,11 +898,16 @@ def __balance_datasets(self, combined_df, original_dfs): return balanced_dataset_df def __build_final_df(self, top_dfs): - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] + #cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] # aggregate common feature vectors feature_dict, count_dict, max_label_dict, df_dict = self.__aggregate_datasets(top_dfs) + #print(feature_dict) + #print(count_dict) + #print(max_label_dict) + #print(df_dict) + highest_perfs = self.__combine_datasets(top_dfs) - + # sort features by count and max label sorted_features = sorted(count_dict.keys(), key=lambda x: (-count_dict[x], -max_label_dict[x])) @@ -931,7 +915,7 @@ def __build_final_df(self, top_dfs): final_dataset = [] added_features = set() used_dfs = set() - + for feature in sorted_features: if feature not in added_features: @@ -957,7 +941,7 @@ def __build_final_df(self, top_dfs): for df_id in diff: df = df_dict[df_id] for _, row in df.iterrows(): - 
feature = tuple(row[:-1]) + feature = row['PerfConfig'] if feature not in added_features: added_features.add(feature) final_dataset.append(feature) @@ -965,7 +949,7 @@ def __build_final_df(self, top_dfs): if len(final_dataset) < self.N: for _, row in highest_perfs.iterrows(): - feature = tuple(row[:-1]) + feature = row['PerfConfig'] if feature not in added_features: added_features.add(feature) final_dataset.append(feature) @@ -973,15 +957,14 @@ def __build_final_df(self, top_dfs): break # add more high performers - - return pd.DataFrame(final_dataset, columns=cols).head(self.N) # though this should really not be set + return pd.DataFrame(final_dataset).head(self.N) # though this should really not be set def getConfig(self, combined_df): config_dict = {} type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize) - type_gemm_dict = orderGemmDict(type_gemm_dict) + #type_gemm_dict = orderGemmDict(type_gemm_dict) for dtype, dfs in type_gemm_dict.items(): From 327fd0b6ba382744a58733330481e7f815e21f64 Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 5 Aug 2024 16:42:57 +0000 Subject: [PATCH 11/14] updated default method --- mlir/utils/performance/quickTunerGen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index 1eac6b182d30..ac31249c783b 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -801,8 +801,9 @@ def getConfig(self, combined_df): sorted_data[datatype] = sorted_configs df_dict = {} + for datatype, value in sorted_data.items(): - df_dict[datatype] = self.__data2df(value).head(self.N) + df_dict[datatype] = pd.DataFrame(value.keys()) self.config = df_dict return df_dict From b9835b90feba27fe1861808dc13b2f61aae45d38 Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 5 Aug 2024 20:06:23 +0000 Subject: [PATCH 12/14] Changed output format and input format --- mlir/utils/performance/quickTunerGen.py | 128 
++++++------------------ 1 file changed, 31 insertions(+), 97 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index ac31249c783b..d204221c0399 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -18,9 +18,7 @@ os.environ['OPENBLAS_NUM_THREADS'] = '1' os.environ['OMP_NUM_THREADS'] = '1' import sys - -sys.path.append('../..') - +import csv import argparse import pandas as pd import numpy as np @@ -80,14 +78,9 @@ def saveQt(self, name=None, directory=None, debug=False, suffix=".qt", pf_format if directory: fname = os.path.join(directory, fname) df = type_df[t] - if 'performance' in df.columns: - df = df.drop(labels=['performance'], axis=1) - #df = df.astype(int) - header = True - if pf_format: - df = self.__perfconfig_formatter(df) - header = False - df = df.to_csv(fname, index=False) + with open(fname, 'w') as f: + row = df['PerfConfig'] + f.write("\n".join(row)) def savePerfConfig(self, name=None, dtype=None, prefix="v2:"): """ @@ -215,7 +208,6 @@ def printConfigs(self): for k in self.method_results: for dtype in self.method_results[k]: df = self.method_results[k][dtype] - #df = df.astype(int) print(f"dtype: {dtype}\n{df}\n") """ @@ -258,7 +250,7 @@ def parseData(file): comment='#') data['perf_config'] = data['perf_config'].str.split(':').str[1] - + tile_params = data['perf_config'].str.split(',', expand=True).astype(int) tile_params.columns = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] @@ -345,21 +337,26 @@ def orderByType(combined_df: str, normalize=False): final_df = combined_df unique_data_types = final_df['DataType'].unique() - perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] + #perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] - perf_configs = 
final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) + #perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) - perf_configs.columns = perf_config_cols + #perf_configs.columns = perf_config_cols - perf_configs['performance'] = final_df['NormalizedTFlops'] + #perf_configs['performance'] = final_df['NormalizedTFlops'] - perf_configs['DataType'] = final_df['DataType'] + final_df['performance'] = final_df['NormalizedTFlops'] + + #perf_configs['DataType'] = final_df['DataType'] if normalize: scaler = MinMaxScaler() - perf_configs['performance'] = scaler.fit_transform(perf_configs[['performance']]) + #perf_configs['performance'] = scaler.fit_transform(perf_configs[['performance']]) + final_df['performance'] = scaler.fit_transform(final_df[['performance']]) - result = {dtype: group.drop(['DataType'], axis=1) for dtype, group in perf_configs.groupby('DataType')} + #result = {dtype: group.drop(['DataType'], axis=1) for dtype, group in perf_configs.groupby('DataType')} + + result = {dtype: group.drop(['DataType'], axis=1) for dtype, group in final_df.groupby('DataType')} return result @@ -390,12 +387,10 @@ def orderByGemmType(combined_df: str, normalize=True): #perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in final_df.groupby(['DataType'])} - print(grouped.keys()) for k in grouped: group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(target_cols)} grouped[k] = group - print(group.keys()) return grouped @@ -429,9 +424,9 @@ def __init__(self, name=None, N=40, normalize=True): self.normalize = normalize def getConfig(self, combined_df): - type_dict = orderByType(input_file, normalize=self.normalize) + type_dict = orderByType(combined_df, normalize=self.normalize) # update this to not use columns - type_dict = orderDict(type_dict) + type_dict = orderDict(type_dict) # change this to not rely 
on columns config_dict = {} @@ -439,8 +434,7 @@ def getConfig(self, combined_df): num_segments = self.N // 2 seg_size = len(v) // num_segments selected_configs = pd.concat([v.iloc[i * seg_size:(i+1) * seg_size].head(2) for i in range(num_segments)]) - - config_dict[k] = selected_configs + config_dict[k] = selected_configs[['PerfConfig', 'performance']] self.config = config_dict return self.config @@ -474,11 +468,11 @@ def getConfig(self, input_file): # now we have a list of the gemms in combined # remove any repetitions and order by appearance - grouped_df = df.groupby(['M/block','N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'], as_index=False).agg({'performance': 'count'}).rename(columns={'performance': 'count'}) + grouped_df = df.groupby(['PerfConfig'], as_index=False).agg({'performance': 'count'}).rename(columns={'performance': 'count'}) - result_df = pd.merge(df, grouped_df, on=['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore']) + result_df = pd.merge(df, grouped_df, on=['PerfConfig']) - final_df = result_df.loc[result_df.groupby(['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'])['performance'].idxmax()] + final_df = result_df.loc[result_df.groupby(['PerfConfig'])['performance'].idxmax()] final_df = final_df.sort_values(by=['count', 'performance'], ascending=[False, False]) @@ -500,7 +494,7 @@ def __init__(self, name=None, N=40, normalize=True): def getConfig(self, combined_df): config_dict = {} - type_gemm_dict = orderByGemmType(input_file, normalize=self.normalize) + type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize) type_gemm_dict = orderGemmDict(type_gemm_dict) @@ -532,7 +526,7 @@ def getConfig(self, combined_df): return self.config -class topConfigCluster(quickTunerMethod): +class topConfigCluster(quickTunerMethod): #disabled """ Cluster each run, take sample from total, can be improved via new 
distance metric @@ -547,7 +541,7 @@ def __init__(self, name=None, N=40, normalize=True): def getConfig(self, combined_df): N=self.N n_clusters = N//2 - type_dict = orderByType(input_file, normalize=self.normalize) + type_dict = orderByType(combined_df, normalize=self.normalize) type_dict = orderDict(type_dict) @@ -558,15 +552,11 @@ def getConfig(self, combined_df): # now we have normalized data for k,df in type_dict.items(): try: - # cluster each type - - features = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore', 'performance'] scaler = StandardScaler() features_scaled = pd.DataFrame(scaler.fit_transform(df[features]), columns=features) - features_scaled['performance'] = df['performance'] @@ -699,19 +689,6 @@ def __init__(self, name=None, N=40, normalize=True): self.normalize = normalize self.N - def __data2df(self, data): - def split_str(s): - return s.split(':')[-1].split(',') - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] - df_dict = {} - for k in data: - for i,n in enumerate(split_str(k)): - col = cols[i] - if col not in df_dict: - df_dict[col] = [] - df_dict[col].append(int(n)) - return pd.DataFrame(df_dict) - def __get_value(self, data_dict, data_type, perfconfig): try: return data_dict[data_type][perfconfig] @@ -803,7 +780,7 @@ def getConfig(self, combined_df): df_dict = {} for datatype, value in sorted_data.items(): - df_dict[datatype] = pd.DataFrame(value.keys()) + df_dict[datatype] = pd.DataFrame(value.keys(), columns=['PerfConfig']) self.config = df_dict return df_dict @@ -838,7 +815,6 @@ def __get_top_90_percent(self, df): return df_sorted[df_sorted['performance'] >= self.threshold] def __combine_datasets(self, dfs): - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] combined_df = pd.concat(dfs).sort_values(by='performance', ascending=False) combined_df = 
combined_df.drop_duplicates(subset='PerfConfig', keep='first') return combined_df @@ -853,12 +829,8 @@ def __aggregate_datasets(self, dfs): df_id = id(df) df_dict[df_id] = df for _, row in df.iterrows(): - print(row) #feature_vector = tuple(row[:-1]) # get feature (all but performance) feature_vector = row['PerfConfig'] - print(feature_vector) - print(row['PerfConfig']) - #exit(1) label = row['performance'] feature_dict[feature_vector].append(df_id) count_dict[feature_vector] += 1 @@ -867,45 +839,10 @@ def __aggregate_datasets(self, dfs): return feature_dict, count_dict, max_label_dict, df_dict - def __balance_datasets(self, combined_df, original_dfs): - cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] - selected_features = set() - balanced_dataset = [] - - for i in range(len(original_dfs)): - if len(balanced_dataset) >= 40: - break - df = original_dfs[i] - - for _, row in df.iterrows(): - feature_tuple = tuple(row[cols]) - - if feature_tuple not in selected_features: - selected_features.add(feature_tuple) - balanced_dataset.append(feature_tuple) - break - - for _, row in combined_df.iterrows(): - if len(balanced_dataset) >= 30: - break - feature_tuple = tuple(row[cols]) - - if feature_tuple not in selected_features: - selected_features.add(feature_tuple) - balanced_dataset.append(row) - - balanced_dataset_df = pd.DataFrame(balanced_dataset, columns=cols) - - return balanced_dataset_df - def __build_final_df(self, top_dfs): #cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] # aggregate common feature vectors feature_dict, count_dict, max_label_dict, df_dict = self.__aggregate_datasets(top_dfs) - #print(feature_dict) - #print(count_dict) - #print(max_label_dict) - #print(df_dict) highest_perfs = self.__combine_datasets(top_dfs) @@ -956,7 +893,6 @@ def __build_final_df(self, top_dfs): final_dataset.append(feature) if len(final_dataset) >= self.N: break 
- # add more high performers return pd.DataFrame(final_dataset).head(self.N) # though this should really not be set @@ -965,16 +901,14 @@ def getConfig(self, combined_df): type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize) - #type_gemm_dict = orderGemmDict(type_gemm_dict) - - for dtype, dfs in type_gemm_dict.items(): - + for dtype, dfs in type_gemm_dict.items(): top_90_percent = [] for cfg in dfs: df = dfs[cfg] top_90_percent.append(self.__get_top_90_percent(df)) - - config_dict[dtype] = self.__build_final_df(top_90_percent) + df = self.__build_final_df(top_90_percent) + df.columns = ['PerfConfig'] + config_dict[dtype] = df self.config = config_dict return config_dict From fc8b5c7c953a014c7ed76b7ae507c9de83cb017b Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 8 Aug 2024 15:41:58 +0000 Subject: [PATCH 13/14] Added conv labels for columns --- mlir/utils/performance/quickTunerGen.py | 133 +++++++++++++----------- 1 file changed, 73 insertions(+), 60 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index d204221c0399..d266347315f7 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -40,10 +40,11 @@ class quickTunerMethod(object): """ base class for creating quick tuner methods, implement the getConfig() method. 
""" - def __init__(self, name=None, N=40): + def __init__(self, op, name=None, N=40): self.N = N self.config = None - if name is None: + self.op = op + if not name: self.name = self.__class__.__name__ else: self.name = name @@ -127,16 +128,9 @@ def __init__(self, pargs): self.methods = {} self.N = pargs.num self.directory = pargs.directory - """ - maybe something like - if self.input_dir: - self.combined_df = qtPreprocessor.process(self.input_dir) - else: - self.combined_df = self.input_file - """ + self.op = pargs.op self.input_file = pargs.input_file self.combined_df = pd.read_csv(self.input_file, sep='\t') - #self.__parseValidationArgs(pargs) uneeded self.__parseMethods(pargs) def __parseMethods(self, pargs): @@ -146,15 +140,15 @@ def __parseMethods(self, pargs): gen_methods = pargs.method for method in gen_methods: if method == 'default': - self.addMethod(defaultQuickTune(method, self.N)) + self.addMethod(defaultQuickTune(self.op, method, N=self.N)) elif method == 'topNSelect': - self.addMethod(topNSelection(method, self.N)) + self.addMethod(topNSelection(self.op, method, N=self.N)) elif method == 'topMode': - self.addMethod(topMode(method, self.N)) + self.addMethod(topMode(self.op, method, N=self.N)) elif method == 'takeNEach': - self.addMethod(takeNEach(method, self.N)) + self.addMethod(takeNEach(self.op, method, N=self.N)) elif method == 'fairSelect': - self.addMethod(fairSelect(method, self.N)) + self.addMethod(fairSelect(self.op, method, N=self.N)) else: raise ValueError(f"Unknown method: {method}") @@ -337,25 +331,12 @@ def orderByType(combined_df: str, normalize=False): final_df = combined_df unique_data_types = final_df['DataType'].unique() - #perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] - - #perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) - - #perf_configs.columns = perf_config_cols - - #perf_configs['performance'] = 
final_df['NormalizedTFlops'] - final_df['performance'] = final_df['NormalizedTFlops'] - #perf_configs['DataType'] = final_df['DataType'] - if normalize: scaler = MinMaxScaler() - #perf_configs['performance'] = scaler.fit_transform(perf_configs[['performance']]) final_df['performance'] = scaler.fit_transform(final_df[['performance']]) - #result = {dtype: group.drop(['DataType'], axis=1) for dtype, group in perf_configs.groupby('DataType')} - result = {dtype: group.drop(['DataType'], axis=1) for dtype, group in final_df.groupby('DataType')} return result @@ -376,23 +357,32 @@ def orderByGemmType(combined_df: str, normalize=True): final_df['performance'] = final_df['NormalizedTFlops'] - #perf_config_cols = ['M/block', 'N/block', 'K/block', 'M/wave', 'N/wave', 'kPack', 'splitK', 'forceUnroll', 'bCopyMore'] + grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in final_df.groupby(['DataType'])} + + for k in grouped: + group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(target_cols)} + grouped[k] = group + + return grouped - #perf_configs = final_df['PerfConfig'].str.split(':').str[1].str.split(',', expand=True).astype(int) +def orderByConvType(combined_df: str, normalize=True): - #perf_configs.columns = perf_config_cols + final_df = combined_df - #perf_configs['performance'] = final_df['NormalizedTFlops'] + cols = ['N', 'C', 'K', 'Y', 'X', 'DilationH', 'DilationW', 'StrideH', 'StrideW', 'PaddingH', 'PaddingW'] + + final_df = final_df.astype({entry: int for entry in cols}) - #perf_configs = perf_configs.join(final_df[target_cols + ['DataType']]) + final_df['performance'] = final_df['NormalizedTFlops'] grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in final_df.groupby(['DataType'])} for k in grouped: - group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(target_cols)} + group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(cols)} grouped[k] = group return grouped + def 
convertToConfig(type_df, filename, suffix=".qt", debug=False): @@ -419,8 +409,8 @@ class topNSelection(quickTunerMethod): splits data by type then splits into certain percentage evenly, taking the top performers from each group """ - def __init__(self, name=None, N=40, normalize=True): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True): + super().__init__(op, name, N) self.normalize = normalize def getConfig(self, combined_df): @@ -445,18 +435,22 @@ class topMode(quickTunerMethod): perf configs """ - def __init__(self, name=None, N=40, normalize=True): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True): + super().__init__(op, name, N) + self.op = op self.normalize = normalize def getConfig(self, input_file): config_dict = {} - - type_gemm_dict = orderByGemmType(input_file, normalize=self.normalize) - type_gemm_dict = orderGemmDict(type_gemm_dict) + if self.op == 'gemm': + type_dict = orderByGemmType(input_file, normalize=self.normalize) + elif self.op == 'conv': + type_dict = orderByConvType(input_file, normalize=self.normalize) - for k, v in type_gemm_dict.items(): + type_dict = orderGemmDict(type_dict) + + for k, v in type_dict.items(): combined = [] for sub_key in v: df = v[sub_key] @@ -487,22 +481,30 @@ class takeNEach(quickTunerMethod): take top performers from N dataframes """ - def __init__(self, name=None, N=40, normalize=True): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True): + super().__init__(op, name, N) + self.op = op self.normalize = normalize def getConfig(self, combined_df): config_dict = {} - - type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize) - type_gemm_dict = orderGemmDict(type_gemm_dict) + + if self.op == 'gemm': + type_dict = orderByGemmType(combined_df, normalize=self.normalize) + elif self.op == 'conv': + type_dict = orderByConvType(combined_df, normalize=self.normalize) + + type_dict = 
orderByGemmType(combined_df, normalize=self.normalize) + + + type_dict = orderGemmDict(type_dict) # calculate size for amount to take N = self.N - for k, v in type_gemm_dict.items(): + for k, v in type_dict.items(): sub_dict_size = len(v) subset_size = N // sub_dict_size if subset_size == 0: @@ -534,8 +536,8 @@ class topConfigCluster(quickTunerMethod): #disabled try with DBSCAN again """ - def __init__(self, name=None, N=40, normalize=True): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True): + super().__init__(op, name, N) self.normalize = normalize def getConfig(self, combined_df): @@ -597,8 +599,8 @@ class topGemmCluster(quickTunerMethod): would hopefully contribute to a similar perf config peformance. """ - def __init__(self, name=None, N=40, normalize=True): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True): + super().__init__(op, name, N) self.normalize = normalize def getConfig(self, combined_df): @@ -684,8 +686,9 @@ class defaultQuickTune(quickTunerMethod): """ take entire set and aggregate the repeats, averaging them out/ weighing them more heavily """ - def __init__(self, name=None, N=40, normalize=True): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True): + super().__init__(op, name, N) + self.op = op self.normalize = normalize self.N @@ -716,7 +719,7 @@ def __analyzeData(self, combined_df, avrg_tfops_per_datatype): unique_data_types = final_df['DataType'].unique() # iterate through unique data type results = {} - operations = ['gemm'] + operations = [self.op] for data_type in unique_data_types: win_counts = {} for operation in operations: @@ -805,8 +808,9 @@ class fairSelect(quickTunerMethod): cut down spacce (df.head(N)) """ - def __init__(self, name=None, N=40, normalize=True, threshold=0.95): - super().__init__(name, N) + def __init__(self, op, name=None, N=40, normalize=True, threshold=0.95): + super().__init__(op, name, N) + self.op = op 
self.normalize = normalize self.threshold = threshold # top 95 percent for efficiency @@ -898,10 +902,13 @@ def __build_final_df(self, top_dfs): def getConfig(self, combined_df): config_dict = {} - - type_gemm_dict = orderByGemmType(combined_df, normalize=self.normalize) - for dtype, dfs in type_gemm_dict.items(): + if self.op == 'gemm': + type_dict = orderByGemmType(combined_df, normalize=self.normalize) + elif self.op == 'conv': + type_dict = orderByConvType(combined_df, normalize=self.normalize) + + for dtype, dfs in type_dict.items(): top_90_percent = [] for cfg in dfs: df = dfs[cfg] @@ -932,6 +939,12 @@ def main(args=None): default=["default","fairSelect"], help='Select perfConfig gen selection method') + parser.add_argument('--op', '--operation', + type=str, + choices=["gemm", "conv"], + default="gemm", + help='Operation (gemm or conv)') + parser.add_argument('--save', action='store_true', default=False, From 4c16fcd7ce132bd0d67003d39ee75e86115da90d Mon Sep 17 00:00:00 2001 From: Djordje Ramic Date: Fri, 4 Oct 2024 14:20:30 +0000 Subject: [PATCH 14/14] Minor fixes --- mlir/utils/performance/quickTunerGen.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/mlir/utils/performance/quickTunerGen.py b/mlir/utils/performance/quickTunerGen.py index d266347315f7..b32fb1251a46 100644 --- a/mlir/utils/performance/quickTunerGen.py +++ b/mlir/utils/performance/quickTunerGen.py @@ -5,11 +5,9 @@ as input. 
Needs the input to be a combined normalized dataframe (default from quickTunerPreproc.py) -Usage: clusterConfigs.py [-h] --input-file INPUT_FILE [--method {default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} [{default,topNSelect,topMode,takeNEach,fairSelect,hardcoded} ...]] [--save] [--debug] [--num NUM] [--perfconfig--format] - Example Usage: -python3 quickTunerGen.py --input-file TESTFILE.out --method fairSelect --save --debug --num 20 +python3 quickTunerGen.py --input-file TESTFILE.out --method fairSelect --save --debug --top 20 Will read TESTFILE.out then generate a quick tune list of length 20 for each datatype in TESTFILE.out. Will print these lists and save them to METHODNAME.DTYPE.qt. """ @@ -36,7 +34,7 @@ import glob import traceback -class quickTunerMethod(object): +class quickTunerMethod(): """ base class for creating quick tuner methods, implement the getConfig() method. """ @@ -55,7 +53,6 @@ def __perfconfig_formatter(self, df, prefix="v2:"): """ df.iloc[:, 0] = prefix + df.iloc[:, 0].astype(str) return df - def setN(self, N): """ @@ -119,14 +116,14 @@ def getConfig(self, combined_df): raise NotImplementedError() -class quickTuner(object): +class quickTuner(): """ quickTuner class to run quick tuning methods from, requires user to instantiate quickTuner object then register quickTunerMethod child classes, finally run tune() """ def __init__(self, pargs): self.methods = {} - self.N = pargs.num + self.N = pargs.top self.directory = pargs.directory self.op = pargs.op self.input_file = pargs.input_file @@ -313,11 +310,9 @@ def parseDir(input_file: str, normalize=True): final_df = input_file trans_cols = ['TransA', 'TransB'] - param_cols = [ 'G', 'M', 'N','K'] final_df = final_df.astype({entry: bool for entry in trans_cols}) - final_df = final_df.astype({entry: int for entry in param_cols}) target_cols = trans_cols + param_cols @@ -346,11 +341,9 @@ def orderByGemmType(combined_df: str, normalize=True): final_df = combined_df trans_cols = ['TransA', 
'TransB'] - param_cols = [ 'G', 'M', 'N','K'] final_df = final_df.astype({entry: bool for entry in trans_cols}) - final_df = final_df.astype({entry: int for entry in param_cols}) target_cols = trans_cols + param_cols @@ -369,16 +362,16 @@ def orderByConvType(combined_df: str, normalize=True): final_df = combined_df - cols = ['N', 'C', 'K', 'Y', 'X', 'DilationH', 'DilationW', 'StrideH', 'StrideW', 'PaddingH', 'PaddingW'] + target_cols = ['InputLayout','N', 'C', 'K', 'Y', 'X', 'DilationH', 'DilationW', 'StrideH', 'StrideW', 'PaddingH', 'PaddingW'] - final_df = final_df.astype({entry: int for entry in cols}) + final_df = final_df.astype({entry: str for entry in target_cols}) final_df['performance'] = final_df['NormalizedTFlops'] grouped = {dtype[0]: df.drop('DataType', axis=1) for dtype, df in final_df.groupby(['DataType'])} for k in grouped: - group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(cols)} + group = {cols: df.drop(target_cols, axis=1) for cols, df in grouped[k].groupby(target_cols)} grouped[k] = group return grouped @@ -926,7 +919,7 @@ def main(args=None): if args is None: args = sys.argv[1:] - parser = argparse.ArgumentParser(prog='clusterConfigs.py', + parser = argparse.ArgumentParser(prog='quickTunerGen.py', description='Bunch together runs into a parallel dir named ../{DIR_NAME}-bunch') parser.add_argument('--input-file', @@ -955,7 +948,7 @@ def main(args=None): default=False, help='Print debug info, print config files to stdout') - parser.add_argument('--num', '-n', + parser.add_argument('--top', '-n', type=int, default=40, help='Number of perf configs to include')