diff --git a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py index 5496d6568..5c7e9136f 100644 --- a/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py +++ b/artifacts/FractalTensor/benchmarks/blocked_sparse_attention/tvm/bigbird_tvm.py @@ -245,7 +245,7 @@ def bigbird_wv_SPMM_reduce(N, L, B, H, W, R, G, dtype): verbose=2, ) - # tuner.tune(tune_option) + tuner.tune(tune_option) funcs = [] diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py index 1996f60df..6f14e63e8 100644 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_pt.py @@ -52,10 +52,10 @@ class PytorchGrid(unittest.TestCase): OUTPUT_FILE = cmd_args.output_file DEFAULT_TEST = cmd_args.default_test - if OUTPUT_FILE: - with open(OUTPUT_FILE, 'w') as fout: - fout.write( - "depth\t[seq_length, batch_size, hidden_size]\tPyTorch(ms)\n") + # if OUTPUT_FILE: + # with open(OUTPUT_FILE, 'w') as fout: + # fout.write( + # "depth\t[seq_length, batch_size, hidden_size]\tPyTorch(ms)\n") LOG_DEBUG_INFO = 1 PROFILER_ENABLE = 0 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py index 0454d2b51..cc0a81201 100644 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/gridlstm_tf.py @@ -4,6 +4,7 @@ import unittest import os import logging +import argparse import datetime import test_utils as tu @@ -17,6 +18,17 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "0" +def str2bool(v): + if isinstance(v, bool): + return v + if v in ('True'): + return True + elif v in ('False'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def parse_test_args(): parser = argparse.ArgumentParser(description='Girdlstm') parser.add_argument( @@ -26,67 +38,72 @@ def parse_test_args(): parser.add_argument( '--hidden_size', type=int, help='Hidden size', default=256) parser.add_argument('--depth', type=int, help='Depth size', default=4) + parser.add_argument( + '--output_file', type=str, help='Output file path', default=None) + parser.add_argument( + '--default_test', + type=str2bool, + help='Whether to run the default test', + default=False) return parser.parse_args() class TFGraphGridLSTM(unittest.TestCase): WARM_UP = 5 - ITERS = 10 + ITERS = 1 cmd_args = parse_test_args() SEQ_LEN = cmd_args.seq_len BATCH_SIZE = cmd_args.batch_size - HIDDEN_SIZE = cmd_args.hidden_size - DEPTH = cmd_args.depth + HIDDEN = cmd_args.hidden_size + NUM_LAYERS = cmd_args.depth + OUTPUT_FILE = cmd_args.output_file + DEFAULT_TEST = cmd_args.default_test + + # if OUTPUT_FILE: + # with open(OUTPUT_FILE, 'w') as fout: + # fout.write( + # "depth\t[seq_length, batch_size, hidden_size]\tTensorFlow(ms)\n") LOG_DEBUG_INFO = 1 PROFILER_ENABLE = 0 def setUp(self): tf.compat.v2.random.set_seed(1234) - self._init_logger() self.stddev = 1.0 / math.sqrt(TFGraphGridLSTM.HIDDEN) self.shape = (TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, TFGraphGridLSTM.HIDDEN) - def _init_logger(self): - self.logger = logging.getLogger() - logging.basicConfig( - level=(logging.DEBUG - if TFGraphGridLSTM.LOG_DEBUG_INFO else logging.INFO), - filename="grid_lstm_results_tensorflow_graph.txt", - filemode="w", - format="%(message)s") - - def _report(self, test_name, start): - """ - Args: - test_name (String): Name of the test. - start (String): Timestamp of the start time. - """ + def _report(self, test_name, test_case, start): + seq_len, batch_size, hidden, num_layers = test_case elapsed_time = time.time() - start average_time = elapsed_time / TFGraphGridLSTM.ITERS - seq_per_sec = ( - TFGraphGridLSTM.ITERS * TFGraphGridLSTM.BATCH_SIZE) / elapsed_time - self.logger.info(("|%s|%.4f\t|%.4f\t|%.4f|") % - (test_name, average_time, elapsed_time, seq_per_sec)) - print(( - "|test_name = %s|average_time = %.4f s|elapsed_time = %.4f s|seq_per_sec = %.4f|" - ) % (test_name, average_time, elapsed_time, seq_per_sec)) - - def _apply_forward(self, dev, test_name, model): + + print(f"\nbench-grid\tdepth\t{num_layers}\tseq_length\t{seq_len}\t" + f"batch_size\t{batch_size}\t" + f"hidden_size\t{hidden}\tTensorFlow(ms)\t{average_time * 1000}") + + if self.OUTPUT_FILE: + with open(self.OUTPUT_FILE, 'a') as fout: + fout.write( + f"{num_layers}\t[{seq_len}, {seq_len}, {batch_size}, {hidden}]\t" + f"{average_time * 1000}\n") + + def _apply_forward(self, dev, test_name, test_case, model): """Only Test the forward computation. Args: dev, String: Device that on which the test is running. cpu or gpu. test_name, String: Name of the test. model, Callable: The tested model. It should be a callable object. """ - + seq_len, batch_size, hidden, num_layers = test_case + shape = (seq_len, batch_size, hidden) + stddev = 1.0 / math.sqrt(hidden) with tf.device(tu.device(dev)): source = tf.random.uniform( - self.shape, minval=-self.stddev, maxval=self.stddev) + shape, minval=-stddev, maxval=stddev) target = tf.random.uniform( - self.shape, minval=-self.stddev, maxval=self.stddev) + shape, minval=-stddev, maxval=stddev) output = model(source, target) @@ -108,41 +125,97 @@ def _apply_forward(self, dev, test_name, model): if TFGraphGridLSTM.PROFILER_ENABLE: tf.profiler.experimental.stop() - self._report(test_name, start) + self._report(test_name, test_case, start) def test_fine_grained_op_lstm_forward(self): - for device in [ - "cpu", - "gpu", - ]: - model = FineGrainedOpGridLSTMNet( - TFGraphGridLSTM.NUM_LAYERS, TFGraphGridLSTM.SEQ_LEN, - TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, - TFGraphGridLSTM.HIDDEN) - self._apply_forward( - device, f"graph_finegrained_op_lstm_{device}_forward", model) - - def test_while_op_lstm_forward(self): - for device in [ - "cpu", - "gpu", - ]: - model = WhileOpGridLSTMNet( - TFGraphGridLSTM.NUM_LAYERS, TFGraphGridLSTM.SEQ_LEN, - TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, - TFGraphGridLSTM.HIDDEN) - self._apply_forward(device, - f"graph_while_op_lstm_{device}_forward", model) - - def test_base_while_op_lstm_forward(self): - for device in [ - "cpu", - "gpu", - ]: - model = BaseWhileOpGridLSTMNet(TFGraphGridLSTM.HIDDEN) - self._apply_forward( - device, f"graph_base_while_op_lstm_{device}_forward", model) - + if not self.DEFAULT_TEST: + for device in [ + # "cpu", + "gpu", + ]: + model = FineGrainedOpGridLSTMNet( + TFGraphGridLSTM.NUM_LAYERS, TFGraphGridLSTM.SEQ_LEN, + TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + TFGraphGridLSTM.HIDDEN) + test_case = [ + TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + TFGraphGridLSTM.HIDDEN, TFGraphGridLSTM.NUM_LAYERS + ] + self._apply_forward( + device, f"graph_finegrained_op_lstm_{device}_forward", test_case, model) + + # def test_while_op_lstm_forward(self): + # if not self.DEFAULT_TEST: + # for device in [ + # # "cpu", + # "gpu", + # ]: + # model = WhileOpGridLSTMNet( + # TFGraphGridLSTM.NUM_LAYERS, TFGraphGridLSTM.SEQ_LEN, + # TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + # TFGraphGridLSTM.HIDDEN) + # test_case = [ + # TFGraphGridLSTM.SEQ_LEN, TFGraphGridLSTM.BATCH_SIZE, + # TFGraphGridLSTM.HIDDEN, TFGraphGridLSTM.NUM_LAYERS + # ] + # self._apply_forward(device, + # f"graph_while_op_lstm_{device}_forward", test_case, model) + + # def test_base_while_op_lstm_forward(self): + # for device in [ + # # "cpu", + # "gpu", + # ]: + # model = BaseWhileOpGridLSTMNet(TFGraphGridLSTM.HIDDEN) + # self._apply_forward( + # device, f"graph_base_while_op_lstm_{device}_forward", model) + + def test_default_data(self): + if self.DEFAULT_TEST: + test_name = f"gridlstm_gpu_forward_TensorFlow" + print("default test:", test_name) + + def build_data(test_case): + seq_len, batch_size, hidden, num_layers = test_case + model = FineGrainedOpGridLSTMNet( + num_layers, seq_len, + seq_len, batch_size, + hidden) + return model + + test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [10, 32, 256, 32], + [10, 32, 512, 32], + [10, 32, 1024, 32], + + # scale with depth + [10, 32, 256, 1], + [10, 32, 256, 2], + [10, 32, 256, 4], + [10, 32, 256, 8], + [10, 32, 256, 16], + [10, 32, 256, 32], + [10, 32, 1024, 1], + [10, 32, 1024, 2], + [10, 32, 1024, 4], + [10, 32, 1024, 8], + [10, 32, 1024, 16], + [10, 32, 1024, 32], + + # scale with length + [5, 32, 256, 32], + [7, 32, 256, 32], + [10, 32, 256, 32], + [5, 32, 1024, 32], + [7, 32, 1024, 32], + [10, 32, 1024, 32], + ] + + for test_case in test_cases: + model = build_data(test_case) + self._apply_forward('gpu' ,test_name, test_case, model) if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh index c110c9541..62f4987d3 100755 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh @@ -3,13 +3,18 @@ seq_len=10 batch_size=32 +root_dir=$(pwd) + +log_dir="$root_dir/../../../../../logs" +echo $'depth\t[seq_length, batch_size, hidden_size]\tPyTorch(ms)' > $log_dir/pt_grid_lstm.tsv + # overall hiddens='256 512 1024' for hidden in $hiddens; do - python3 gridlstm_pt.py --seq_len=$seq_len \ - --batch_size=$batch_size \ - --hidden_size=$hidden \ - --depth=32 + python3 gridlstm_pt.py --seq_len $seq_len \ + --batch_size $batch_size \ + --hidden_size $hidden \ + --depth 32 --output_file $log_dir/pt_grid_lstm.tsv done # scale with depth @@ -17,21 +22,22 @@ depths='1 2 4 8 16 32' hiddens='256 1024' for hidden in $hiddens; do for depth in $depths; do - python3 gridlstm_pt.py --seq_len=$seq_len \ - --batch_size=$batch_size \ - --hidden_size=$hidden \ - --depth=$depth + python3 gridlstm_pt.py --seq_len $seq_len \ + --batch_size $batch_size \ + --hidden_size $hidden \ + --depth $depth --output_file $log_dir/pt_grid_lstm.tsv done done # scale with length lengths='5 7 10' hiddens='256 1024' -for length in $lengths; do - for hidden in $hiddens; do - python3 gridlstm_pt.py --seq_len=$seq_len \ - --batch_size=32 \ - --hidden_size=$hidden \ - --depth=32 + +for hidden in $hiddens; do + for length in $lengths; do + python3 gridlstm_pt.py --seq_len $length \ + --batch_size 32 \ + --hidden_size $hidden \ + --depth 32 --output_file $log_dir/pt_grid_lstm.tsv done -done +done \ No newline at end of file diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py index 1cd08d2d3..09c87fb73 100644 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow_graph.py @@ -18,7 +18,6 @@ def setUp(self): self.stddev = 1.0 / math.sqrt(HIDDEN_SIZE) self.log_dir = '' - self.logger = init_logger(self.log_dir, 'tensorflow_drnn_graph.txt') def test_drnn_forward(self): shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) @@ -29,7 +28,7 @@ def test_drnn_forward(self): with tf.compat.v1.Session() as sess: for device in [ - 'cpu', + # 'cpu', '/device:GPU:0', ]: with tf.device(device): @@ -70,8 +69,12 @@ def test_drnn_forward(self): inputs: x_data, pads: padding_data }) + elapsed_time = time() - start + average_time = elapsed_time / ITERS test_name = f'TensorFlow_Stacked_DLSTM_graph_{device}' - report(test_name, start, self.logger) + test_case = [SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE, NUM_LAYERS] + report(test_name, test_case, OUTPUT_FILE, average_time * 1000) + # report(test_name, start, self.logger) if __name__ == '__main__': diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py index 3e89519d9..b5a60b079 100644 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_dilated_rnn/utils.py @@ -58,7 +58,7 @@ def parse_test_args(): if not DEFAULT_TEST: DILATION = DILATION[0:NUM_LAYERS] -ITERS = 10 +ITERS = 1 WARMUP = 5 LOG_DEBUG_INFO = 0 diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py index 54bb47a2b..dfa3e974f 100644 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_eager.py @@ -175,6 +175,32 @@ def build_model(test_case): return GraphModeModel test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [64, 256, 256, 32], + [64, 256, 512, 32], + [64, 256, 1024, 32], + # scale with depth + [64, 256, 256, 1], + [64, 256, 256, 2], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 16], + [64, 256, 256, 32], + [64, 256, 1024, 1], + [64, 256, 1024, 2], + [64, 256, 1024, 4], + [64, 256, 1024, 8], + [64, 256, 1024, 16], + [64, 256, 1024, 32], + # scale with length + [32, 256, 256, 32], + [64, 256, 256, 32], + [128, 256, 256, 32], + [32, 256, 1024, 32], + [64, 256, 1024, 32], + [128, 256, 1024, 32], + # figure 2 [64, 256, 256, 1], [64, 256, 256, 4], [64, 256, 256, 8], @@ -182,7 +208,6 @@ def build_model(test_case): [64, 256, 256, 16], [64, 256, 256, 20], ] - if self.OUTPUT_FILE: with open(self.OUTPUT_FILE, 'w') as fout: fout.write( diff --git a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py index 54fd0d5f2..cd947e00b 100644 --- a/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py +++ b/artifacts/FractalTensor/benchmarks/rnn/baselines/stacked_lstm/stacked_lstm_tensorflow_graph.py @@ -195,6 +195,32 @@ def build_model2(test_case): return GraphModeModel test_cases = [ + # overall + # [seq_len, batch_size, hidden, num_layers] + [64, 256, 256, 32], + [64, 256, 512, 32], + [64, 256, 1024, 32], + # scale with depth + [64, 256, 256, 1], + [64, 256, 256, 2], + [64, 256, 256, 4], + [64, 256, 256, 8], + [64, 256, 256, 16], + [64, 256, 256, 32], + [64, 256, 1024, 1], + [64, 256, 1024, 2], + [64, 256, 1024, 4], + [64, 256, 1024, 8], + [64, 256, 1024, 16], + [64, 256, 1024, 32], + # scale with length + [32, 256, 256, 32], + [64, 256, 256, 32], + [128, 256, 256, 32], + [32, 256, 1024, 32], + [64, 256, 1024, 32], + [128, 256, 1024, 32], + # figure 2 [64, 256, 256, 1], [64, 256, 256, 4], [64, 256, 256, 8],