diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 49e8c4080..7a1b7ff4b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -300,7 +300,7 @@ jobs: strategy: max-parallel: 1 matrix: - app: [hello_world, imatmul, fmatmul, iconv2d, fconv2d, fconv3d, jacobi2d, dropout, fft, dwt, exp, softmax, dotproduct, fdotproduct, pathfinder, roi_align] + app: [hello_world, imatmul, fmatmul, iconv2d, fconv2d, fconv3d, jacobi2d, dropout, fft, dwt, exp, softmax, dotproduct, fdotproduct, pathfinder, roi_align, lavamd] ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["compile-ara", "compile-apps"] steps: @@ -661,6 +661,11 @@ jobs: with: name: roi_align_roofline path: roi_align.png + - name: Upload the lavamd roofline + uses: actions/upload-artifact@v2 + with: + name: lavamd_roofline + path: lavamd.png #################### # Clean-up stage # diff --git a/CHANGELOG.md b/CHANGELOG.md index d1d57b547..48030ba6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Plot kernels-Vl performance plot - Print I$/D$ stall metrics - Add `spmv`, `conjugate_gradient`, and `gemv` kernels. + - Add lavaMD `app`, benchmark, and performance plot ### Changed diff --git a/apps/benchmarks/benchmark/lavamd.bmark b/apps/benchmarks/benchmark/lavamd.bmark new file mode 100644 index 000000000..86aabe631 --- /dev/null +++ b/apps/benchmarks/benchmark/lavamd.bmark @@ -0,0 +1,71 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Matteo Perotti + +#include "../kernel/lavamd.h" +#include "runtime.h" +#include "util.h" + +#ifndef SPIKE +#include "printf.h" +#else +#include <stdio.h> +#endif + +#ifndef WARM_CACHES_ITER +#define WARM_CACHES_ITER 1 +#endif + +extern fp alpha; +extern uint64_t n_boxes; +extern uint64_t NUMBER_PAR_PER_BOX; + +extern box_str box_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern FOUR_VECTOR rv_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern fp qv_cpu_mem[] __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern FOUR_VECTOR fv_v_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern FOUR_VECTOR fv_s_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern nei_str nn_mem[] __attribute__((aligned(4 * NR_LANES), section(".l2"))); + +void warm_caches(uint64_t heat) { + for (uint64_t k = 0; k < heat; ++k) + kernel_vec(alpha, n_boxes, box_cpu_mem, rv_cpu_mem, qv_cpu_mem, fv_s_cpu_mem, + NUMBER_PAR_PER_BOX); +} + +int main() { +#ifndef SPIKE + // Warm-up caches with untimed kernel run(s) writing to fv_s_cpu_mem; skipped on Spike + warm_caches(WARM_CACHES_ITER); +#endif + + HW_CNT_READY; + start_timer(); + kernel_vec(alpha, n_boxes, box_cpu_mem, rv_cpu_mem, qv_cpu_mem, fv_v_cpu_mem, + NUMBER_PAR_PER_BOX); + stop_timer(); + HW_CNT_NOT_READY; + + int64_t runtime = get_timer(); + printf("[sw-cycles]: %ld\n", runtime); + + return 0; +} diff --git a/apps/benchmarks/kernel/lavamd.c b/apps/benchmarks/kernel/lavamd.c new file mode 120000 index 000000000..afd31151b --- /dev/null +++ b/apps/benchmarks/kernel/lavamd.c @@ -0,0 +1 @@ 
+../../lavamd/kernel/lavamd.c \ No newline at end of file diff --git a/apps/benchmarks/kernel/lavamd.h b/apps/benchmarks/kernel/lavamd.h new file mode 120000 index 000000000..855befc80 --- /dev/null +++ b/apps/benchmarks/kernel/lavamd.h @@ -0,0 +1 @@ +../../lavamd/kernel/lavamd.h \ No newline at end of file diff --git a/apps/benchmarks/lib/exp.h b/apps/benchmarks/lib/exp.h new file mode 120000 index 000000000..a3e98976a --- /dev/null +++ b/apps/benchmarks/lib/exp.h @@ -0,0 +1 @@ +../../exp/kernel/exp.h \ No newline at end of file diff --git a/apps/benchmarks/main.c b/apps/benchmarks/main.c index fe90838ad..405f33669 100644 --- a/apps/benchmarks/main.c +++ b/apps/benchmarks/main.c @@ -73,6 +73,9 @@ #elif defined(ROI_ALIGN) #include "benchmark/roi_align.bmark" +#elif defined(LAVAMD) +#include "benchmark/lavamd.bmark" + #else #error \ "Error, no kernel was specified. Please, run 'make bin/benchmarks ENV_DEFINES=-D${KERNEL}', where KERNEL contains the kernel to benchmark. For example: 'make bin/benchmarks ENV_DEFINES=-DIMATMUL'." 
diff --git a/apps/common/default_args.mk b/apps/common/default_args.mk index 6aa0edda6..d7f9abccc 100644 --- a/apps/common/default_args.mk +++ b/apps/common/default_args.mk @@ -33,3 +33,5 @@ def_args_roi_align ?= "1 32 4 4 4 2 2" def_args_spmv ?= "128 128 0.6" # Conjugate gradient size and steps def_args_conjugate_gradient ?= "128 0 0.5" +# box1d, particles_per_box, alpha, maxelm +def_args_lavamd ?= "2 32 0.5 128" diff --git a/apps/common/rivec/vector_defines.h b/apps/common/rivec/vector_defines.h index 6d17b14e1..ae9442565 100644 --- a/apps/common/rivec/vector_defines.h +++ b/apps/common/rivec/vector_defines.h @@ -6,6 +6,9 @@ // RISC-V VECTOR intrinsics mapping by Cristóbal Ramírez Lazo, "Barcelona 2019" +#ifndef _RIVEC_VECTOR_DEFINES_H_ +#define _RIVEC_VECTOR_DEFINES_H_ + #include "riscv_vector.h" /* @@ -123,8 +126,31 @@ #define _MM_VFLT_f64(op1, op2, vl) vmflt_vv_f64m1_b64(op1, op2, vl) #define _MM_VFLT_f32(op1, op2, vl) vmflt_vv_f32m1_b32(op1, op2, vl) +#define _MM_VFSGNJN_f64(op1, op2, vl) vfsgnjn_vv_f64m1(op1, op2, vl) +#define _MM_VFSGNJN_f32(op1, op2, vl) vfsgnjn_vv_f32m1(op1, op2, vl) + +#define _MM_REDSUM_f64(dest, vector, scalar, vl) \ + vfredusum_vs_f64m1_f64m1(dest, vector, scalar, vl) +#define _MM_REDSUM_f32(dest, vector, scalar, vl) \ + vfredusum_vs_f32m1_f32m1(dest, vector, scalar, vl) + +/* + Memory Ops Intrinsics +*/ + +#define _MM_LOAD_f64(base, vl) vle64_v_f64m1(base, vl) +#define _MM_LOAD_f32(base, vl) vle32_v_f32m1(base, vl) + +#define _MM_STORE_f64(base, value, vl) vse64_v_f64m1(base, value, vl) +#define _MM_STORE_f32(base, value, vl) vse32_v_f32m1(base, value, vl) + +#define _MM_LOAD_STRIDE_f64(base, bstride, vl) vlse64_v_f64m1(base, bstride, vl) +#define _MM_LOAD_STRIDE_f32(base, bstride, vl) vlse32_v_f32m1(base, bstride, vl) + /* Ancillary Defines */ #define FENCE() asm volatile("fence"); + +#endif diff --git a/apps/lavamd/LICENSE b/apps/lavamd/LICENSE new file mode 100644 index 000000000..99c97002f --- /dev/null +++ b/apps/lavamd/LICENSE 
@@ -0,0 +1,38 @@ +LICENSE TERMS + +Copyright (c)2008-2011 University of Virginia +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use this software or a modified version of it, please cite the most relevant among the following papers: + +- M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. 
In Proceedings +of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International +Symposium on Computer Architecture (ISCA), June 2010. + +- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron. +"Rodinia: A Benchmark Suite for Heterogeneous Computing". IEEE International Symposium +on Workload Characterization, Oct 2009. + +- J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization +for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International +Conference on Supercomputing (ICS), June 2009. + +- L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems +Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International +Symposium on Computer Architecture (ISCA), June 2009. + +- M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA: +A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel +and Distributed Processing Symposium (IPDPS), May 2009. + +- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance +Study of General Purpose Applications on Graphics Processors using CUDA" Journal of +Parallel and Distributed Computing, Elsevier, June 2008. diff --git a/apps/lavamd/LICENSE_1 b/apps/lavamd/LICENSE_1 new file mode 100644 index 000000000..932cdc0f5 --- /dev/null +++ b/apps/lavamd/LICENSE_1 @@ -0,0 +1,28 @@ +Copyright (c) 2020, Barcelona Supercomputing Center +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer; +redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution; +neither the name of the copyright holders nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use this software or a modified version of it for your research, please cite the paper: +Cristóbal Ramírez, César Hernandez, Oscar Palomar, Osman Unsal, Marco Ramírez, and Adrián Cristal. 2020. A RISC-V Simulator and Benchmark Suite for Designing and Evaluating Vector Architectures. ACM Trans. Archit. Code Optim. 17, 4, Article 38 (October 2020), 29 pages. 
https://doi.org/10.1145/3422667 diff --git a/apps/lavamd/kernel/lavamd.c b/apps/lavamd/kernel/lavamd.c new file mode 100644 index 000000000..8d66dcf09 --- /dev/null +++ b/apps/lavamd/kernel/lavamd.c @@ -0,0 +1,346 @@ +// See LICENSE and LICENSE_1 for licensing terms of the original +// and vectorized version, respectively. + +/************************************************************************* + * RISC-V Vectorized Version + * Author: Cristóbal Ramírez Lazo + * email: cristobal.ramirez@bsc.es + * Barcelona Supercomputing Center (2020) + *************************************************************************/ + +// Modifications + Fixes to the vectorized version by: +// Matteo Perotti + +#include "lavamd.h" +#include "stdio.h" +void kernel(fp alpha, uint64_t n_boxes, box_str *box, FOUR_VECTOR *rv, fp *qv, + FOUR_VECTOR *fv, uint64_t NUMBER_PAR_PER_BOX) { + + /////////////// + // Variables // + /////////////// + + // parameters + fp a2; + + // counters + uint64_t i, j, k, l; + + // home box + long first_i; + FOUR_VECTOR *rA; + FOUR_VECTOR *fA; + + // neighbor box + int pointer; + long first_j; + FOUR_VECTOR *rB; + fp *qB; + + // common + fp r2; + fp u2; + fp fs; + fp vij; + fp fxij, fyij, fzij; + THREE_VECTOR d; + + ///////////////// + // MCPU SETUP // + ///////////////// + + // Inputs + a2 = 2.0 * alpha * alpha; + + ////////////////////////// + // Process interactions // + ////////////////////////// +#ifndef PSEUDO_LAVAMD + for (l = 0; l < n_boxes; ++l) { +#else + for (l = 0; l < 1; ++l) { +#endif + + //--------------------------- + // home box - box parameters + //--------------------------- + + // offset to common arrays + first_i = box[l].offset; + + //--------------------------- + // home box - distance, force, charge and type parameters from common + // arrays + //--------------------------- + + rA = &rv[first_i]; + fA = &fv[first_i]; + + //--------------------------- + // Do for the # of (home+neighbor) boxes + //--------------------------- +#ifndef 
PSEUDO_LAVAMD + for (k = 0; k < (uint64_t)(1 + box[l].nn); ++k) { +#else + for (k = 0; k < 2; ++k) { +#endif + //----------------------- + // neighbor box - get pointer to the right box + //----------------------- + + if (k == 0) { + // set first box to be processed to home box + pointer = l; + } else { + // remaining boxes are neighbor boxes + pointer = box[l].nei[k - 1].number; + } + + //----------------------- + // neighbor box - box parameters + //----------------------- + + first_j = box[pointer].offset; + + //----------------------- + // neighbor box - distance, force, charge and type parameters + //----------------------- + + rB = &rv[first_j]; + qB = &qv[first_j]; + + //----------------------- + // Do for the # of particles in home box + //----------------------- + +#ifndef PSEUDO_LAVAMD + for (i = 0; i < NUMBER_PAR_PER_BOX; ++i) { +#else + for (i = 0; i < 4; ++i) { +#endif + // do for the # of particles in current (home or neighbor) box + for (j = 0; j < NUMBER_PAR_PER_BOX; ++j) { + // coefficients + r2 = rA[i].v + rB[j].v - DOT(rA[i], rB[j]); + u2 = a2 * r2; + vij = exp(-u2); + fs = 2. 
* vij; + d.x = rA[i].x - rB[j].x; + d.y = rA[i].y - rB[j].y; + d.z = rA[i].z - rB[j].z; + fxij = fs * d.x; + fyij = fs * d.y; + fzij = fs * d.z; + + // forces + fA[i].v += qB[j] * vij; + fA[i].x += qB[j] * fxij; + fA[i].y += qB[j] * fyij; + fA[i].z += qB[j] * fzij; + } + } + } + } +} + +void kernel_vec(fp alpha, uint64_t n_boxes, box_str *box, FOUR_VECTOR *rv, + fp *qv, FOUR_VECTOR *fv, uint64_t NUMBER_PAR_PER_BOX) { + + //================ + // Variables + //================ + + // parameters + fp a2; + + // counters + uint64_t i, j, k, l; + + // home box + long first_i; + FOUR_VECTOR *rA; + FOUR_VECTOR *fA; + + // neighbor box + int pointer; + long first_j; + FOUR_VECTOR *rB; + fp *qB; + + //============== + // INPUTS + //============== + + a2 = 2.0 * alpha * alpha; + + //=========================== + // PROCESS INTERACTIONS + //=========================== + +#ifndef PSEUDO_LAVAMD + for (l = 0; l < n_boxes; ++l) { +#else + for (l = 0; l < 1; ++l) { +#endif + + //----------------------------- + // home box - box parameters + //----------------------------- + + first_i = box[l].offset; // offset to common arrays + + //--------------------------------------------------------------------- + // home box - distance, force, charge and type parameters from common + // arrays + //--------------------------------------------------------------------- + + rA = &rv[first_i]; + fA = &fv[first_i]; + + //----------------------------------------- + // Do for the # of (home+neighbor) boxes + //----------------------------------------- +#ifndef PSEUDO_LAVAMD + for (k = 0; k < (uint64_t)(1 + box[l].nn); k++) { +#else + for (k = 0; k < 2; k++) { +#endif + + //--------------------------------------- + // neighbor box - get pointer to the right box + //--------------------------------------- + + if (k == 0) { + pointer = l; // set first box to be processed to home box + } else { + pointer = + box[l].nei[k - 1].number; // remaining boxes are neighbor boxes + } + + 
//--------------------------------------- + // neighbor box - box parameters + //--------------------------------------- + + first_j = box[pointer].offset; + + //--------------------------------------- + // neighbor box - distance, force, charge and type parameters + //--------------------------------------- + + rB = &rv[first_j]; + qB = &qv[first_j]; + + //--------------------------------------- + // Do for the # of particles in home box + //--------------------------------------- +#ifndef PSEUDO_LAVAMD + for (i = 0; i < NUMBER_PAR_PER_BOX; ++i) { +#else + for (i = 0; i < 4; ++i) { +#endif + + unsigned long int gvl = vsetvl_e32m1(NUMBER_PAR_PER_BOX); + + _MMR_f32 xr2; + _MMR_f32 xDOT; + _MMR_f32 xu2; + _MMR_f32 xa2 = _MM_SET_f32(a2, gvl); + _MMR_f32 xvij; + _MMR_f32 xrA_v = _MM_SET_f32(rA[i].v, gvl); + _MMR_f32 xrA_x = _MM_SET_f32(rA[i].x, gvl); + _MMR_f32 xrA_y = _MM_SET_f32(rA[i].y, gvl); + _MMR_f32 xrA_z = _MM_SET_f32(rA[i].z, gvl); + _MMR_f32 xrB_v; + _MMR_f32 xrB_x; + _MMR_f32 xrB_y; + _MMR_f32 xrB_z; + _MMR_f32 xd_x; + _MMR_f32 xd_y; + _MMR_f32 xd_z; + _MMR_f32 xfxij; + _MMR_f32 xfyij; + _MMR_f32 xfzij; + _MMR_f32 xfs; + _MMR_f32 xqB; + _MMR_f32 xfA_v = _MM_SET_f32(0.0, gvl); + _MMR_f32 xfA_x = _MM_SET_f32(0.0, gvl); + _MMR_f32 xfA_y = _MM_SET_f32(0.0, gvl); + _MMR_f32 xfA_z = _MM_SET_f32(0.0, gvl); + _MMR_f32 xfA_1_v = _MM_SET_f32(0.0, 1); + _MMR_f32 xfA_1_x = _MM_SET_f32(0.0, 1); + _MMR_f32 xfA_1_y = _MM_SET_f32(0.0, 1); + _MMR_f32 xfA_1_z = _MM_SET_f32(0.0, 1); + + // do for the # of particles in current (home or neighbor) box + for (j = 0; j < NUMBER_PAR_PER_BOX; j += gvl) { + gvl = vsetvl_e32m1(NUMBER_PAR_PER_BOX - j); + // coefficients + xrB_v = _MM_LOAD_STRIDE_f32(&rB[j].v, 16, gvl); + xrB_x = _MM_LOAD_STRIDE_f32(&rB[j].x, 16, gvl); + xrB_y = _MM_LOAD_STRIDE_f32(&rB[j].y, 16, gvl); + xrB_z = _MM_LOAD_STRIDE_f32(&rB[j].z, 16, gvl); + // r2 = rA[i].v + rB[j].v - DOT(rA[i],rB[j]); + xr2 = _MM_ADD_f32(xrA_v, xrB_v, gvl); + xDOT = _MM_MUL_f32(xrA_x, xrB_x, 
gvl); + xDOT = _MM_MACC_f32(xDOT, xrA_y, xrB_y, gvl); + xDOT = _MM_MACC_f32(xDOT, xrA_z, xrB_z, gvl); + xr2 = _MM_SUB_f32(xr2, xDOT, gvl); + // u2 = a2*r2; + xu2 = _MM_MUL_f32(xa2, xr2, gvl); + // vij= exp(-u2); + xvij = __exp_2xf32(_MM_VFSGNJN_f32(xu2, xu2, gvl), gvl); + + if (k && (j + gvl) >= NUMBER_PAR_PER_BOX) { + // Last chunk of a neighbor box (k != 0): reload the running fA[i] to seed the final reductions + xfA_1_v = _MM_LOAD_f32(&fA[i].v, 1); + xfA_1_x = _MM_LOAD_f32(&fA[i].x, 1); + } + + // fs = 2.*vij; + xfs = _MM_MUL_f32(_MM_SET_f32(2.0f, gvl), xvij, gvl); + // d.x = rA[i].x - rB[j].x; + xd_x = _MM_SUB_f32(xrA_x, xrB_x, gvl); + // d.y = rA[i].y - rB[j].y; + xd_y = _MM_SUB_f32(xrA_y, xrB_y, gvl); + + if (k && (j + gvl) >= NUMBER_PAR_PER_BOX) { + xfA_1_y = _MM_LOAD_f32(&fA[i].y, 1); + xfA_1_z = _MM_LOAD_f32(&fA[i].z, 1); + } + + // d.z = rA[i].z - rB[j].z; + xd_z = _MM_SUB_f32(xrA_z, xrB_z, gvl); + // fxij=fs*d.x; + xfxij = _MM_MUL_f32(xfs, xd_x, gvl); + // fyij=fs*d.y; + xfyij = _MM_MUL_f32(xfs, xd_y, gvl); + // fzij=fs*d.z; + xfzij = _MM_MUL_f32(xfs, xd_z, gvl); + + // forces - NOTE(review): gvl is reset to the full length just below, so on a shorter last chunk the qB load and the MACCs would cover elements past the chunk; TODO confirm + // fA[i].v += qB[j]*vij; + // fA[i].x += qB[j]*fxij; + // fA[i].y += qB[j]*fyij; + // fA[i].z += qB[j]*fzij; + gvl = vsetvl_e32m1(NUMBER_PAR_PER_BOX); + xqB = _MM_LOAD_f32(&qB[j], gvl); + xfA_v = _MM_MACC_f32(xfA_v, xqB, xvij, gvl); + xfA_x = _MM_MACC_f32(xfA_x, xqB, xfxij, gvl); + xfA_y = _MM_MACC_f32(xfA_y, xqB, xfyij, gvl); + xfA_z = _MM_MACC_f32(xfA_z, xqB, xfzij, gvl); + } + + gvl = vsetvl_e32m1(NUMBER_PAR_PER_BOX); + + xfA_1_v = _MM_REDSUM_f32(xfA_1_v, xfA_v, xfA_1_v, gvl); + xfA_1_x = _MM_REDSUM_f32(xfA_1_x, xfA_x, xfA_1_x, gvl); + xfA_1_y = _MM_REDSUM_f32(xfA_1_y, xfA_y, xfA_1_y, gvl); + xfA_1_z = _MM_REDSUM_f32(xfA_1_z, xfA_z, xfA_1_z, gvl); + _MM_STORE_f32(&fA[i].v, xfA_1_v, 1); + _MM_STORE_f32(&fA[i].x, xfA_1_x, 1); + 
_MM_STORE_f32(&fA[i].y, xfA_1_y, 1); + _MM_STORE_f32(&fA[i].z, xfA_1_z, 1); + } + } + } +} diff --git a/apps/lavamd/kernel/lavamd.h b/apps/lavamd/kernel/lavamd.h new file mode 100644 index 000000000..2d6a18210 --- 
/dev/null +++ b/apps/lavamd/kernel/lavamd.h @@ -0,0 +1,74 @@ +// See LICENSE and LICENSE_1 for licensing terms of the original +// and vectorized version, respectively. + +/************************************************************************* + * RISC-V Vectorized Version + * Author: Cristóbal Ramírez Lazo + * email: cristobal.ramirez@bsc.es + * Barcelona Supercomputing Center (2020) + *************************************************************************/ + +// Modifications + Fixes to the vectorized version by: +// Matteo Perotti + +#ifndef _LAVAMD_H_ +#define _LAVAMD_H_ + +#include "../lib/exp.h" +#include "rivec/vector_defines.h" +#include <math.h> +#include <stdint.h> + +// This macro simplifies the program for benchmarking purposes, +// by removing the outer loops of the kernel and shortening the +// second-to-last loop. +// Approximately all the time spent in executing the program is +// in the inner nested loop. The one immediately before it also +// accounts for some of the time. The rest is negligible. +// Use this macro to save time while benchmarking. 
+#define PSEUDO_LAVAMD 1 + +#define fp float + +#define DOT(A, B) (((A).x) * ((B).x) + ((A).y) * ((B).y) + ((A).z) * ((B).z)) + +typedef struct __attribute__((__packed__)) { + + fp x, y, z; + +} THREE_VECTOR; + +typedef struct __attribute__((__packed__)) { + + fp v, x, y, z; + +} FOUR_VECTOR; + +typedef struct __attribute__((__packed__)) nei_str { + + // neighbor box + int x, y, z; + int number; + long offset; + +} nei_str; + +typedef struct __attribute__((__packed__)) box_str { + + // home box + int x, y, z; + int number; + long offset; + // neighbor boxes: the kernels only read the first nn entries of nei[] + int nn; + nei_str nei[26]; +} box_str; + +void kernel(fp alpha, uint64_t n_boxes, box_str *box, FOUR_VECTOR *rv, fp *qv, + FOUR_VECTOR *fv, uint64_t NUMBER_PAR_PER_BOX); +void kernel_vec(fp alpha, uint64_t n_boxes, box_str *box, FOUR_VECTOR *rv, + fp *qv, FOUR_VECTOR *fv, uint64_t NUMBER_PAR_PER_BOX); + +#define THRESHOLD 0.001 + +#endif diff --git a/apps/lavamd/lib/exp.h b/apps/lavamd/lib/exp.h new file mode 120000 index 000000000..a3e98976a --- /dev/null +++ b/apps/lavamd/lib/exp.h @@ -0,0 +1 @@ +../../exp/kernel/exp.h \ No newline at end of file diff --git a/apps/lavamd/main.c b/apps/lavamd/main.c new file mode 100644 index 000000000..b4dcddc68 --- /dev/null +++ b/apps/lavamd/main.c @@ -0,0 +1,95 @@ +// See LICENSE and LICENSE_1 for licensing terms of the original +// and vectorized version, respectively. 
+ +/************************************************************************* + * RISC-V Vectorized Version + * Author: Cristóbal Ramírez Lazo + * email: cristobal.ramirez@bsc.es + * Barcelona Supercomputing Center (2020) + *************************************************************************/ + +// Modifications + Fixes to the vectorized version by: +// Matteo Perotti + +#include "kernel/lavamd.h" +#include "runtime.h" +#include "util.h" + +#ifndef SPIKE +#include "printf.h" +#else +#include <stdio.h> +#endif + +extern fp alpha; +extern uint64_t n_boxes; +extern uint64_t NUMBER_PAR_PER_BOX; + +extern box_str box_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern FOUR_VECTOR rv_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern fp qv_cpu_mem[] __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern FOUR_VECTOR fv_s_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern FOUR_VECTOR fv_v_cpu_mem[] + __attribute__((aligned(4 * NR_LANES), section(".l2"))); +extern nei_str nn_mem[] __attribute__((aligned(4 * NR_LANES), section(".l2"))); + +int main() { + + printf("\n"); + printf("=============\n"); + printf("= LAVA-MD =\n"); + printf("=============\n"); + printf("\n"); + printf("\n"); + + int err = 0; + + printf("n_boxes = %lu, NUMBER_PAR_PER_BOX = %lu\n", n_boxes, + NUMBER_PAR_PER_BOX); +#ifdef DEBUG + printf("sizeof(box_cpu_mem[0]) = %lu\n", sizeof(box_cpu_mem[0])); + + for (uint64_t i = 0; i < n_boxes; i++) { + printf("box_cpu_mem[%lu].offset = %ld, while .number == %d\n", i, + box_cpu_mem[i].offset, box_cpu_mem[i].number); + } +#endif + + printf("Running the scalar benchmark.\n"); + kernel(alpha, n_boxes, box_cpu_mem, rv_cpu_mem, qv_cpu_mem, fv_s_cpu_mem, + NUMBER_PAR_PER_BOX); + + printf("Pre vec kernel s == %x, v == %x\n", + *((uint32_t *)&(fv_s_cpu_mem[0].v)), + *((uint32_t *)&(fv_v_cpu_mem[0].v))); + + printf("Running the vector benchmark.\n"); + kernel_vec(alpha, n_boxes, box_cpu_mem, 
rv_cpu_mem, qv_cpu_mem, fv_v_cpu_mem, + NUMBER_PAR_PER_BOX); + + printf("s == %x, v == %x\n", *((uint32_t *)&(fv_s_cpu_mem[0].v)), + *((uint32_t *)&(fv_v_cpu_mem[0].v))); + + // Check every particle: the fv_* arrays hold n_boxes * NUMBER_PAR_PER_BOX entries + for (uint64_t i = 0; i < n_boxes * NUMBER_PAR_PER_BOX; ++i) { + if (!similarity_check_32b(fv_s_cpu_mem[i].v, fv_v_cpu_mem[i].v, + THRESHOLD) || + !similarity_check_32b(fv_s_cpu_mem[i].x, fv_v_cpu_mem[i].x, + THRESHOLD) || + !similarity_check_32b(fv_s_cpu_mem[i].y, fv_v_cpu_mem[i].y, + THRESHOLD) || + !similarity_check_32b(fv_s_cpu_mem[i].z, fv_v_cpu_mem[i].z, + THRESHOLD)) { + printf("Error at index %lu. s.v: %f, v.v: %f \n", i, fv_s_cpu_mem[i].v, + fv_v_cpu_mem[i].v); + err = i ? i : -1; + } + } + if (!err) + printf("Test passed. No errors found.\n"); + + return err; +} diff --git a/apps/lavamd/script/gen_data.py b/apps/lavamd/script/gen_data.py new file mode 100644 index 000000000..0528a791e --- /dev/null +++ b/apps/lavamd/script/gen_data.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# arg1: box grid 1-dim size, arg2: particles per box, arg3: alpha, arg4: maxelm (parsed but unused) + +import random as rand +import numpy as np +import sys + +def emit(name, array, alignment='8'): + print(".global %s" % name) + print(".balign " + alignment) + print("%s:" % name) + bs = array.tobytes() + for i in range(0, len(bs), 4): + s = "" + for n in range(4): + s += "%02x" % bs[i+3-n] + print(" .word 0x%s" % s) + +############ +## SCRIPT ## +############ + +if len(sys.argv) == 5: + boxes1d = int(sys.argv[1]) + par4box = int(sys.argv[2]) + alpha = sys.argv[3] + maxelm = int(sys.argv[4]) +else: + print("Error. Give me four arguments: one-dimension size of the grid, the number of particles per box, alpha, and the maximum length of one vector register for the current LMUL configuration.", file=sys.stderr) + sys.exit(1) + +dtype=np.float32 + +# Constants +NUMBER_PAR_PER_BOX = par4box + +########################## +## Dimension and memory ## +########################## + +n_boxes = boxes1d**3 +n_elm = n_boxes * NUMBER_PAR_PER_BOX +mem_elm = n_elm * dtype(0).itemsize * 4 +mem2_elm = n_elm * dtype(0).itemsize +# Unused estimate: box_str (lavamd.h) holds 5 int, 1 long and 26 nei_str, no pointer +mem_box = n_boxes * (5 * np.int32(0).itemsize + 2 * np.int64(0).itemsize) + +########### +## Boxes ## +########### + +range1d = np.arange(boxes1d, dtype=np.int32) + +box_cpu_x = np.reshape(np.transpose(np.tile(range1d, ( 1, boxes1d**2))), (1, n_boxes)) +box_cpu_y = np.reshape(np.transpose(np.tile(range1d, ( boxes1d, boxes1d))), (1, n_boxes)) +box_cpu_z = np.reshape(np.transpose(np.tile(range1d, (boxes1d**2, 1))), (1, n_boxes)) +box_cpu_number = np.arange(n_boxes, dtype=np.int32) +box_cpu_offset = NUMBER_PAR_PER_BOX * box_cpu_number + +# Check that this is not wider than int32, otherwise it's hard to print with the emit method! 
+assert all(np.iinfo(np.int32).min <= offset <= np.iinfo(np.int32).max for offset in box_cpu_offset) +# Append the MSbs to create a "long" dtype +box_cpu_offset_msb = np.zeros(np.shape(box_cpu_offset)) + +################ +## Neighbours ## +################ + +n_nei_1d = 3 +n_nei = n_nei_1d**3 +range1d_nei = np.arange(-np.floor(n_nei_1d / 2), np.ceil(n_nei_1d / 2), dtype=np.int32) + +# Helper vectors to find the neighbour coordinates +mod_nei_x = np.reshape(np.transpose(np.tile(range1d_nei, ( 1, n_nei_1d**2))), (n_nei, 1)) +mod_nei_y = np.reshape(np.transpose(np.tile(range1d_nei, ( n_nei_1d, n_nei_1d))), (n_nei, 1)) +mod_nei_z = np.reshape(np.transpose(np.tile(range1d_nei, (n_nei_1d**2, 1))), (n_nei, 1)) + +# Find the neighbour coordinates +nei_x = np.tile(box_cpu_x, (n_nei, 1)) + mod_nei_x +nei_y = np.tile(box_cpu_y, (n_nei, 1)) + mod_nei_y +nei_z = np.tile(box_cpu_z, (n_nei, 1)) + mod_nei_z +nei_number = nei_z * boxes1d**2 + nei_y * boxes1d + nei_x +nei_offset = NUMBER_PAR_PER_BOX * nei_number + +# Remove the entry that is the reference box itself (centre of the 3x3x3 stencil) +ref_idx = int(np.floor(n_nei / 2)) +nei_x = np.delete(nei_x , ref_idx, 0) +nei_y = np.delete(nei_y , ref_idx, 0) +nei_z = np.delete(nei_z , ref_idx, 0) +nei_number = np.delete(nei_number, ref_idx, 0) +nei_offset = np.delete(nei_offset, ref_idx, 0) + +# Check that this is not wider than int32, otherwise it's hard to print with the emit method! 
+assert all(np.iinfo(np.int32).min <= offset <= np.iinfo(np.int32).max for col in nei_offset for offset in col) +# Append the MSbs to create a "long" dtype +nei_offset_msb = np.zeros(np.shape(nei_offset)) + +# Find how many neighbours are valid for each box +tmp0 = np.reshape(np.array([0 <= x < boxes1d for row in nei_x for x in row]), (n_nei-1, n_boxes)) +tmp1 = np.reshape(np.array([0 <= y < boxes1d for row in nei_y for y in row]), (n_nei-1, n_boxes)) +tmp2 = np.reshape(np.array([0 <= z < boxes1d for row in nei_z for z in row]), (n_nei-1, n_boxes)) +tmp3 = np.logical_and(tmp0, tmp1) +is_valid_nn = np.logical_and(tmp2, tmp3) +box_nn_list = [sum(e[e==True]) for e in np.transpose(is_valid_nn)] +box_cpu_nn = np.array(box_nn_list).astype(np.int32) + +# Zero the invalid neighbours and pack the valid ones first (stable order): the kernels only read nei[0..nn-1] +order = np.argsort(~is_valid_nn, axis=0, kind='stable') +nei_x, nei_y, nei_z, nei_number, nei_offset = (np.take_along_axis(a * is_valid_nn, order, axis=0) for a in (nei_x, nei_y, nei_z, nei_number, nei_offset)) + +############################################ +## Parameters, distance, charge and force ## +############################################ + +rand.seed() +# Input distances +rv_cpu_v = np.random.uniform(low=0.1, high=1, size=(n_elm)).astype(dtype) +rv_cpu_x = np.random.uniform(low=0.1, high=1, size=(n_elm)).astype(dtype) +rv_cpu_y = np.random.uniform(low=0.1, high=1, size=(n_elm)).astype(dtype) +rv_cpu_z = np.random.uniform(low=0.1, high=1, size=(n_elm)).astype(dtype) + +# Input charge +qv_cpu = np.random.uniform(low=0.1, high=1, size=(n_elm)).astype(dtype) + +# Output forces +fv_cpu_v = np.zeros(n_elm).astype(dtype) +fv_cpu_x = np.zeros(n_elm).astype(dtype) +fv_cpu_y = np.zeros(n_elm).astype(dtype) +fv_cpu_z = np.zeros(n_elm).astype(dtype) + +################### +## Final structs ## +################### + +nn_mem = np.zeros((n_boxes * 6 * 26), dtype=np.int32) +nn_mem[0::6] = np.reshape(np.transpose(nei_x), 
26 * n_boxes) +nn_mem[1::6] = np.reshape(np.transpose(nei_y), 26 * n_boxes) +nn_mem[2::6] = np.reshape(np.transpose(nei_z), 26 * n_boxes) +nn_mem[3::6] = np.reshape(np.transpose(nei_number), 26 * n_boxes) +nn_mem[4::6] = 
np.reshape(np.transpose(nei_offset), 26 * n_boxes) +nn_mem[5::6] = np.reshape(np.transpose(nei_offset_msb), 26 * n_boxes) + +box_cpu_mem = np.zeros((7 + (6 * 26)) * n_boxes).astype(np.int32) +box_cpu_mem[0::(7 + (6 * 26))] = box_cpu_x +box_cpu_mem[1::(7 + (6 * 26))] = box_cpu_y +box_cpu_mem[2::(7 + (6 * 26))] = box_cpu_z +box_cpu_mem[3::(7 + (6 * 26))] = box_cpu_number +box_cpu_mem[4::(7 + (6 * 26))] = box_cpu_offset +box_cpu_mem[5::(7 + (6 * 26))] = box_cpu_offset_msb +box_cpu_mem[6::(7 + (6 * 26))] = box_cpu_nn +for i in range(0, n_boxes): + box_cpu_mem[7 + i*(7 + 6 * 26):(i+1)*(7 + 6 * 26)] = nn_mem[i*(6 * 26):(i+1)*(6 * 26)]; + +rv_cpu_mem = np.zeros(4 * n_elm).astype(dtype) +rv_cpu_mem[0::4] = rv_cpu_v +rv_cpu_mem[1::4] = rv_cpu_x +rv_cpu_mem[2::4] = rv_cpu_y +rv_cpu_mem[3::4] = rv_cpu_z + +qv_cpu_mem = qv_cpu.astype(dtype) + +fv_cpu_mem = np.zeros(4 * n_elm).astype(dtype) +fv_cpu_mem[0::4] = fv_cpu_v +fv_cpu_mem[1::4] = fv_cpu_x +fv_cpu_mem[2::4] = fv_cpu_y +fv_cpu_mem[3::4] = fv_cpu_z + +##################### +## Create the file ## +##################### + +print(".section .data,\"aw\",@progbits") +emit("n_boxes", np.array(n_boxes, dtype=np.uint64)) +emit("alpha", np.array(alpha, dtype=dtype)) +emit("NUMBER_PAR_PER_BOX", np.array(NUMBER_PAR_PER_BOX, dtype=np.uint64)) +emit("box_cpu_mem", box_cpu_mem, 'NR_LANES*4') +emit("rv_cpu_mem", rv_cpu_mem, 'NR_LANES*4') +emit("qv_cpu_mem", qv_cpu_mem, 'NR_LANES*4') +emit("fv_s_cpu_mem", fv_cpu_mem, 'NR_LANES*4') +emit("fv_v_cpu_mem", fv_cpu_mem, 'NR_LANES*4') diff --git a/scripts/benchmark.gnuplot b/scripts/benchmark.gnuplot index fe212cd12..db655cb4e 100644 --- a/scripts/benchmark.gnuplot +++ b/scripts/benchmark.gnuplot @@ -418,3 +418,35 @@ plot roof_cpu(x, 1, 2.4) w l lw 2 lc 1 t '2 Lanes', \ roof_cpu(x, 8, 19.2) w l lw 2 lc 7 t '16 Lanes', \ 'roi_align_16.benchmark' w p lw 2 lc 7 pt 5 notitle, \ 'roi_align_16_ideal.benchmark' w p lw 2 lc 7 pt 4 notitle + +############ +## lavaMD ## +############ + +# Title +set 
title "lavamd performance, (depth: #elements)" + +# Set the range +set xrange [32:1024] + +# Set axis labels +set xlabel 'Depth: (#elements)' +set ylabel 'Performance (OP/cycle)' + +# Output png +set term png +set out "lavamd.png" + +# Plot the rooflines for 32-bit data +plot roof_cpu(x, 1, 5.24) w l lw 2 lc 1 t '2 Lanes', \ + 'lavamd_2.benchmark' w p lw 2 lc 1 pt 5 notitle, \ + 'lavamd_2_ideal.benchmark' w p lw 2 lc 1 pt 4 notitle, \ + roof_cpu(x, 2, 5.5) w l lw 2 lc 2 t '4 Lanes', \ + 'lavamd_4.benchmark' w p lw 2 lc 2 pt 5 notitle, \ + 'lavamd_4_ideal.benchmark' w p lw 2 lc 2 pt 4 notitle, \ + roof_cpu(x, 4, 5.5) w l lw 2 lc 3 t '8 Lanes', \ + 'lavamd_8.benchmark' w p lw 2 lc 3 pt 5 notitle, \ + 'lavamd_8_ideal.benchmark' w p lw 2 lc 3 pt 4 notitle, \ + roof_cpu(x, 8, 5.5) w l lw 2 lc 7 t '16 Lanes', \ + 'lavamd_16.benchmark' w p lw 2 lc 7 pt 5 notitle, \ + 'lavamd_16_ideal.benchmark' w p lw 2 lc 7 pt 4 notitle diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh index 04732b5ac..e036ebd73 100755 --- a/scripts/benchmark.sh +++ b/scripts/benchmark.sh @@ -750,6 +750,50 @@ roi_align() { done } +############ +## lavaMD ## +############ + +lavamd() { + + kernel=lavamd + defines="" + + # Run pseudo-lavamd by default, + # so box1d is not really important + box1d=2 + alpha=0.5 + # Depend on the implementation + lmul=1 + sew=32 + maxelm=$(( ($vlen * $lmul) / $sew )) + + tempfile=`mktemp` + + # Log the performance results + > ${kernel}_${nr_lanes}.benchmark + > ${kernel}_${nr_lanes}_ideal.benchmark + + for par4box in 4 8 16 32 64 96 128 256 512; do + + args="$box1d $par4box $alpha $maxelm" + metadata="$kernel $nr_lanes $par4box $sew" + + clean_and_gen_data $kernel "$args" || exit + + # Default System + compile_and_run $kernel "$defines" $tempfile 0 || exit + extract_performance $kernel "$metadata 0" "$args" $tempfile ${kernel}_${nr_lanes}.benchmark || exit + + # Ideal Dispatcher System, if QuestaSim is available + if [ "$ci" == 0 ]; then + compile_and_run $kernel 
"$defines" $tempfile 1 || exit + extract_performance $kernel "$metadata 1" "$args" $tempfile ${kernel}_${nr_lanes}_ideal.benchmark || exit + verify_id_results 10 $sew || exit + fi + done +} + case $1 in "imatmul" | "fmatmul") matmul $1 @@ -803,6 +847,10 @@ case $1 in roi_align ;; + "lavamd") + lavamd + ;; + *) echo "Benchmarking all the apps." matmul fmatmul @@ -817,5 +865,6 @@ case $1 in dotproduct pathfinder roi_align + lavamd ;; esac diff --git a/scripts/check_cycles.py b/scripts/check_cycles.py index 575fba32a..c86e867b8 100644 --- a/scripts/check_cycles.py +++ b/scripts/check_cycles.py @@ -39,6 +39,7 @@ 'dotproduct' : 500, 'fdotproduct': 500, 'roi_align' : 500, + 'lavamd' : 500, } skip_check = { @@ -57,6 +58,7 @@ 'dotproduct' : 0, 'fdotproduct': 0, 'roi_align' : 1, # This program has a larger scalar component + 'lavamd' : 0, } def main(): diff --git a/scripts/performance.py b/scripts/performance.py index 8fe3bb433..3d74f28bf 100755 --- a/scripts/performance.py +++ b/scripts/performance.py @@ -113,6 +113,14 @@ def roi_align(args, cycles): crop_w = int(args[6]) performance = 9 * depth / cycles return [depth, performance] +def lavamd(args, cycles): + box1d = int(args[0]) + par4box = int(args[1]) + alpha = float(args[2]) + maxelm = int(args[3]) + # pseudo lavaMD iteration bounds: 1, 2, 4, par4box + performance = (1 * 2 * 4 * (51 * par4box + 4 * min(par4box, maxelm))) / cycles + return [par4box, performance] perfExtr = { 'imatmul' : imatmul, @@ -130,6 +138,7 @@ def roi_align(args, cycles): 'dotproduct' : dotproduct, 'fdotproduct': fdotproduct, 'roi_align' : roi_align, + 'lavamd' : lavamd, } # Maximum performance if Ara's BW can be fully utilized @@ -149,6 +158,7 @@ def roi_align(args, cycles): 'dotproduct' : lambda l, s : l * 8/s, 'fdotproduct': lambda l, s : l * 8/s, 'roi_align' : lambda l, s : l * 8/s, + 'lavamd' : lambda l, s : 0, } # Maximum performance taking into account Ara's limited @@ -169,6 +179,7 @@ def roi_align(args, cycles): 'dotproduct' : lambda l, s 
: 4 * l/s, 'fdotproduct': lambda l, s : 4 * l/s, 'roi_align' : lambda l, s : 3/5 * l * 8/s, + 'lavamd' : lambda l, s : 0, } def main():