wasm-slice

#!/usr/bin/env python3

import sys, subprocess, concurrent.futures, re, os, re, subprocess, csv, shutil

# This takes up to 150 GB of memory.
# TODO: why?
# Base directory under which benchmarks/<name>/... paths are resolved;
# overridable through the WASMR3_PATH environment variable.
WASMR3_PATH = os.getenv("WASMR3_PATH", "/home/wasm-r3")
# NOTE(review): this binds the os.getenv function itself, not an environment
# value, and nothing visible in this file reads PARALLEL. It looks like an
# unfinished os.getenv(...) call -- confirm the intended variable and default.
PARALLEL = os.getenv
# Number of seconds handed to the `timeout` command for one slicedice run.
TIMEOUT = 120

# Exactly two positional arguments are required: the interestingness script
# that is run on the sliced module, followed by the Wasm file to slice.
if len(sys.argv) != 3:
    print("Usage: wasm-slice <INTERESTING_SCRIPT> <WASM_FILE>", file=sys.stderr)
    sys.exit(1)

interesting_script = sys.argv[1]
test_input = sys.argv[2]
# Benchmark name: the input file name without its directory and extension.
test_name = os.path.splitext(os.path.basename(test_input))[0]

# We prioritize single function replay of heuristic > dynamic > all

def get_heuristic_fidx(name=None) -> list:
    """Return the hand-picked function indices for a benchmark.

    Args:
        name: Benchmark name to look up. When omitted, the module-level
            ``test_name`` is used.

    Returns:
        The hardcoded list of indices for the benchmark, or an empty list
        when the benchmark has no entry. Returning an empty list (instead of
        raising ``KeyError``) lets callers fall through to lower-priority
        candidates for benchmarks that are not in the table.
    """
    # Hardcode for now
    # TODO: implement heuristics
    testname_to_heuristic = {
        "commanderkeen": [490],
        "funky-kart": [1529],
        "guiicons": [213],
        "hydro": [1290],
        "jsc": [4771],
        "mandelbrot": [7],
        "rfxgen": [271],
        "rguilayout": [216],
        "rguistyler": [260],
        "riconpacker": [244],
        "rtexviewer": [303],
        "sqlgui": [917],
    }
    if name is None:
        name = test_name
    return testname_to_heuristic.get(name, [])

def get_dynamic_fidx():
    """Collect the function indices that executed during a monitored run.

    Invokes ``wizeng.x86-64-linux`` on ``test_input`` with the ``icount``
    monitor and CSV output, then hands the captured text to
    ``extract_dynamic_fidx``.

    Raises:
        subprocess.CalledProcessError: Re-raised after the failing command
            line and its captured output have been printed.
    """
    monitor_cmd = [
        "wizeng.x86-64-linux",
        "-no-names",
        "-csv",
        "--monitors=icount",
        test_input,
    ]
    try:
        csv_text = subprocess.check_output(monitor_cmd, text=True)
    except subprocess.CalledProcessError as err:
        print(f"Command failed: {' '.join(monitor_cmd)}\nError: {err.output}")
        raise
    return extract_dynamic_fidx(csv_text)

def extract_dynamic_fidx(csv_output: str):
    """Parse monitor CSV text and return the indices of executed functions.

    The first line is treated as a header and skipped. For every later row
    with at least three columns, column 0 is read as the function index
    (leading ``#`` characters are stripped) and column 2 as its dynamic
    count. Shorter rows are ignored.

    Args:
        csv_output: CSV text to parse; may be empty.

    Returns:
        Sorted list of function indices whose dynamic count is positive.
        Function index 0 is always left out. An empty list is returned when
        there are no data rows.

    Raises:
        ValueError: If an index or count column is not an integer.
    """
    reader = csv.reader(csv_output.splitlines())
    # The default keeps an entirely empty output from raising StopIteration.
    next(reader, None)  # Skip header

    dynamic_fidx = []
    for row in reader:
        if len(row) < 3:
            continue
        function_index = int(row[0].lstrip('#'))
        dynamic_count = int(row[2])
        if dynamic_count > 0 and function_index != 0:
            dynamic_fidx.append(function_index)
    return sorted(dynamic_fidx)

def get_all_fidx():
    """Return the candidate indices ``0 .. count - 1`` for the input module.

    ``count`` is the value ``extract_function_count`` reads from the output
    of ``wasm-tools objdump`` run on ``test_input``.

    Raises:
        subprocess.CalledProcessError: If ``wasm-tools`` exits non-zero.
        ValueError: If no function count can be found in the objdump output.
    """
    # An argument list (no shell) passes paths containing spaces or shell
    # metacharacters through untouched.
    command = ["wasm-tools", "objdump", test_input]
    output = subprocess.check_output(command, text=True)
    count = extract_function_count(output)
    if count is None:
        # Fail with a clear message instead of the TypeError range(None) gives.
        raise ValueError(f"No function count found in objdump output for {test_input}")
    # NOTE(review): the count comes from the "functions" row only; if indices
    # here are meant to include imported functions as well, this range would
    # stop short -- confirm against how slicedice numbers functions.
    return list(range(count))

def extract_function_count(objdump_output):
    """Pull the entry count out of the ``functions`` row of a section listing.

    Scans the text line by line; the first line that contains ``functions``
    and also matches ``<digits> count`` decides the result.

    Returns:
        The count as an ``int``, or ``None`` when no such line exists.
    """
    count_pattern = re.compile(r'(\d+) count')
    for row in objdump_output.split('\n'):
        if 'functions' not in row:
            continue
        found = count_pattern.search(row)
        if found is not None:
            return int(found.group(1))
    return None


# Sample section listing in the format extract_function_count parses; its
# "functions" row reports 505 entries.
test_get_function_count = """
types                                  |        0xb -      0x198 |       397 bytes | 55 count
functions                              |      0x19b -      0x396 |       507 bytes | 505 count
tables                                 |      0x398 -      0x39f |         7 bytes | 1 count
memories                               |      0x3a1 -      0x3a7 |         6 bytes | 1 count
globals                                |      0x3aa -      0x930 |      1414 bytes | 282 count
exports                                |      0x933 -     0x100c |      1753 bytes | 301 count
elements                               |     0x100f -     0x1193 |       388 bytes | 1 count
data count                             |     0x1195 -     0x1197 |         2 bytes | 1 count
code                                   |     0x119b -    0x46e2b |    285840 bytes | 505 count
data                                   |    0x46e2f -    0x5c006 |     86487 bytes | 1422 count
"""

# Inline self-check executed every time the script starts (asserts are
# skipped when Python runs with -O).
assert extract_function_count(test_get_function_count) == 505

def get_fidx(heuristic_fidx=None, dynamic_fidx=None, all_fidx=None):
    """Build the ordered list of function indices to try.

    Candidates are ordered heuristic first, then dynamic, then all; an index
    that appears in more than one group (or twice in one group) is kept only
    at its first position. The three groups are also printed, each without
    the indices already covered by a higher-priority group.

    Args:
        heuristic_fidx: Heuristic candidates; computed with
            ``get_heuristic_fidx()`` when omitted.
        dynamic_fidx: Dynamically observed candidates; computed with
            ``get_dynamic_fidx()`` when omitted.
        all_fidx: Every candidate; computed with ``get_all_fidx()`` when
            omitted.

    Returns:
        De-duplicated list of indices in priority order.
    """
    if heuristic_fidx is None:
        heuristic_fidx = get_heuristic_fidx()
    if dynamic_fidx is None:
        dynamic_fidx = get_dynamic_fidx()
    if all_fidx is None:
        all_fidx = get_all_fidx()

    # Sets make each membership test O(1); the groups can hold thousands of
    # indices, so list lookups inside the comprehensions would be quadratic.
    heuristic_set = set(heuristic_fidx)
    dynamic_set = set(dynamic_fidx)

    print("Heuristic function indices:", heuristic_fidx)
    dynamic_filtered = [idx for idx in dynamic_fidx if idx not in heuristic_set]
    print("Dynamic function indices (except heuristic):", dynamic_filtered)

    # Filter out both heuristic and dynamic indices from all
    all_filtered = [
        idx for idx in all_fidx
        if idx not in heuristic_set and idx not in dynamic_set
    ]
    print("All function indices (except heuristic and dynamic):", all_filtered)

    # dict keys keep first-insertion order, which gives an order-preserving
    # de-duplication across the three groups in priority order.
    return list(dict.fromkeys([*heuristic_fidx, *dynamic_fidx, *all_fidx]))


def run_slicedice(testname, fidx):
    """Slice benchmark ``testname`` at function ``fidx`` and judge the result.

    Steps, in order: run the ``slicedice`` npm test under ``timeout``; copy
    the produced ``replay.wasm`` to ``<testname>.sliced.wasm`` in the
    benchmark directory; run ``interesting_script`` on the replay module;
    print the sizes of the original input and of the replay module; exit the
    process with the interestingness script's return code.

    Args:
        testname: Benchmark name; selects the directory under
            ``WASMR3_PATH/benchmarks`` and is passed to slicedice via ``-t``.
        fidx: Function index passed to slicedice via ``-i``.

    Returns:
        ``[testname, fidx, "fail"]`` when any step raises. When every step
        succeeds the function does not return; it calls ``sys.exit``.
    """
    try:
        # Argument list (no shell): values are passed verbatim, so names with
        # spaces or shell metacharacters cannot alter the command.
        command = [
            "timeout", f"{TIMEOUT}s",
            "npm", "test", "slicedice", "--",
            "-t", str(testname),
            "-i", str(fidx),
        ]
        # npm's stdout is not used; only a non-zero exit status matters.
        subprocess.run(command, check=True, stdout=subprocess.DEVNULL)
        # TODO: make this configurable
        benchmark_dir = os.path.join(WASMR3_PATH, "benchmarks", str(testname))
        replay_wasm_path = os.path.join(
            benchmark_dir, "out", str(fidx), "benchmarks", "bin_1", "replay.wasm"
        )
        shutil.copy(replay_wasm_path, os.path.join(benchmark_dir, f"{testname}.sliced.wasm"))
        result = subprocess.run([interesting_script, replay_wasm_path], check=False)
        test_input_size = os.path.getsize(test_input)
        replay_wasm_size = os.path.getsize(replay_wasm_path)
        print("Test input file size:", test_input_size)
        print("Sliced wasm file size:", replay_wasm_size)
        # SystemExit is not an Exception subclass, so the handler below does
        # not intercept this exit.
        sys.exit(result.returncode)
    except Exception as e:
        # Deliberately broad: any failure just means this candidate did not
        # work and the caller should move on to the next one.
        print(f"Failed to run {testname} - {fidx}")
        print(e)
        return [testname, fidx, "fail"]

# Try candidates in priority order. run_slicedice() exits the process as soon
# as one candidate has been sliced and judged, so getting past the loop means
# every candidate failed (or there were none).
for fidx in get_fidx():
    run_slicedice(test_name, fidx)

# Report the overall failure with a non-zero status instead of exiting 0.
print(f"No function index produced a slice for {test_name}", file=sys.stderr)
sys.exit(1)