From ebefd45097c4e4a2821e4cbc48f3f02665df5e1b Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Fri, 30 Apr 2021 19:40:20 +0100 Subject: [PATCH 001/141] [datasets] Update AnghaBench to v1. anghabench-v1 uses an amended manifest, but is otherwise the same. --- compiler_gym/envs/llvm/datasets/__init__.py | 19 +++++++++++++ compiler_gym/envs/llvm/datasets/anghabench.py | 28 +++++++++++++------ docs/source/llvm/index.rst | 2 +- tests/llvm/datasets/anghabench_test.py | 14 +++++----- tests/llvm/datasets/llvm_datasets_test.py | 2 +- 5 files changed, 47 insertions(+), 18 deletions(-) diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py index e83cf0d6a..74254497c 100644 --- a/compiler_gym/envs/llvm/datasets/__init__.py +++ b/compiler_gym/envs/llvm/datasets/__init__.py @@ -212,6 +212,25 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset site_data_base = site_data_base or site_data_path("llvm-v0") yield AnghaBenchDataset(site_data_base=site_data_base, sort_order=0) + # Add legacy version of Anghabench using an old manifest. + anghabench_v0_manifest_url, anghabench_v0_manifest_sha256 = { + "darwin": ( + "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2", + "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1", + ), + "linux": ( + "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2", + "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477", + ), + }[sys.platform] + yield AnghaBenchDataset( + name="benchmark://anghabench-v0", + site_data_base=site_data_base, + sort_order=0, + manifest_url=anghabench_v0_manifest_url, + manifest_sha256=anghabench_v0_manifest_sha256, + deprecated="Please use anghabench-v1", + ) yield BlasDataset(site_data_base=site_data_base, sort_order=0) yield CLgenDataset(site_data_base=site_data_base, sort_order=0) yield CBenchDataset(site_data_base=site_data_base, sort_order=-1) diff --git a/compiler_gym/envs/llvm/datasets/anghabench.py b/compiler_gym/envs/llvm/datasets/anghabench.py index bfcb46a65..ecee29f6a 100644 --- a/compiler_gym/envs/llvm/datasets/anghabench.py +++ b/compiler_gym/envs/llvm/datasets/anghabench.py @@ -6,6 +6,7 @@ import sys from concurrent.futures import as_completed from pathlib import Path +from typing import Optional from compiler_gym.datasets import Benchmark, TarDatasetWithManifest from compiler_gym.datasets.benchmark import BenchmarkWithSource @@ -38,19 +39,27 @@ class AnghaBenchDataset(TarDatasetWithManifest): overhead of compiling it from C to bitcode. This is a one-off cost. 
""" - def __init__(self, site_data_base: Path, sort_order: int = 0): - manifest_url, manifest_sha256 = { + def __init__( + self, + site_data_base: Path, + sort_order: int = 0, + manifest_url: Optional[str] = None, + manifest_sha256: Optional[str] = None, + deprecated: Optional[str] = None, + name: Optional[str] = None, + ): + manifest_url_, manifest_sha256_ = { "darwin": ( - "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2", - "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1", + "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v1-macos-manifest.bz2", + "96ead63da5f8efa07fd0370f0c6e452b59bed840828b8b19402102b1ce3ee109", ), "linux": ( - "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2", - "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477", + "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v1-linux-manifest.bz2", + "14df85f650199498cf769715e9f0d7841d09f9fa62a95b8ecc242bdaf227f33a", ), }[sys.platform] super().__init__( - name="benchmark://anghabench-v0", + name=name or "benchmark://anghabench-v1", description="Compile-only C/C++ functions extracted from GitHub", references={ "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf", @@ -58,8 +67,8 @@ def __init__(self, site_data_base: Path, sort_order: int = 0): }, license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1", site_data_base=site_data_base, - manifest_urls=[manifest_url], - manifest_sha256=manifest_sha256, + manifest_urls=[manifest_url or manifest_url_], + manifest_sha256=manifest_sha256 or manifest_sha256_, tar_urls=[ "https://github.com/brenocfg/AnghaBench/archive/d8034ac8562b8c978376008f4b33df01b8887b19.tar.gz" ], @@ -68,6 +77,7 @@ def __init__(self, site_data_base: Path, sort_order: int = 0): tar_compression="gz", benchmark_file_suffix=".bc", sort_order=sort_order, + deprecated=deprecated, ) def benchmark(self, uri: str) -> Benchmark: diff --git a/docs/source/llvm/index.rst b/docs/source/llvm/index.rst index 17115cd6b..eb832d5b0 100644 --- a/docs/source/llvm/index.rst +++ b/docs/source/llvm/index.rst @@ -21,7 +21,7 @@ We provide several datasets of open-source LLVM-IR benchmarks for use: +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ | Dataset | Num. 
Benchmarks [#f1]_ | Description | Validatable [#f2]_ |
+============================+==========================+==================================================================================================================================================================================================================+======================+
-| benchmark://anghabench-v0 | 1,042,976 | Compile-only C/C++ functions extracted from GitHub [`Homepage `__, `Paper `__] | No |
+| benchmark://anghabench-v1 | 1,041,333 | Compile-only C/C++ functions extracted from GitHub [`Homepage `__, `Paper `__] | No |
+----------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://blas-v0 | 300 | Basic linear algebra kernels [`Homepage `__, `Paper `__] | No |
+----------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
diff --git a/tests/llvm/datasets/anghabench_test.py b/tests/llvm/datasets/anghabench_test.py
index 0d1026e82..bb4149dca 100644
--- a/tests/llvm/datasets/anghabench_test.py
+++ b/tests/llvm/datasets/anghabench_test.py
@@ -23,7 +23,7 @@ def anghabench_dataset() -> AnghaBenchDataset:
     env = gym.make("llvm-v0")
     try:
-        ds = env.datasets["anghabench-v0"]
+        ds = env.datasets["anghabench-v1"]
     finally:
         env.close()
     yield ds
@@ -31,9 +31,9 @@ def anghabench_dataset() -> AnghaBenchDataset:

 def test_anghabench_size(anghabench_dataset: AnghaBenchDataset):
     if sys.platform == "darwin":
-        assert anghabench_dataset.size == 1042908
+        assert anghabench_dataset.size == 1041265
     else:
-        assert anghabench_dataset.size == 1042976
+        assert anghabench_dataset.size == 1041333


 def test_missing_benchmark_name(anghabench_dataset: AnghaBenchDataset, mocker):
     mocker.patch.object(anghabench_dataset, "install")

     with pytest.raises(
-        LookupError, match=r"^No benchmark specified: benchmark://anghabench-v0$"
+        LookupError, match=r"^No benchmark specified: benchmark://anghabench-v1$"
     ):
-        anghabench_dataset.benchmark("benchmark://anghabench-v0")
+        anghabench_dataset.benchmark("benchmark://anghabench-v1")
     anghabench_dataset.install.assert_called_once()

     with pytest.raises(
-        LookupError, match=r"^No benchmark specified: benchmark://anghabench-v0/$"
+        LookupError, match=r"^No benchmark specified: benchmark://anghabench-v1/$"
     ):
-        anghabench_dataset.benchmark("benchmark://anghabench-v0/")
+        anghabench_dataset.benchmark("benchmark://anghabench-v1/")
     assert anghabench_dataset.install.call_count == 2

diff --git a/tests/llvm/datasets/llvm_datasets_test.py b/tests/llvm/datasets/llvm_datasets_test.py
index 495f17a6a..892d1b524 100644
--- a/tests/llvm/datasets/llvm_datasets_test.py
+++ b/tests/llvm/datasets/llvm_datasets_test.py
@@ -14,7 +14,7 @@ def test_default_dataset_list():
     try:
         assert list(d.name for d in env.datasets) == [
             "benchmark://cbench-v1",
-            "benchmark://anghabench-v0",
+            "benchmark://anghabench-v1",
             "benchmark://blas-v0",
             "benchmark://clgen-v0",
             "benchmark://github-v0",
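The net effect of the patch above: `benchmark://anghabench-v1` becomes the default AnghaBench dataset, while `benchmark://anghabench-v0` remains loadable from the old manifest and is marked deprecated. A minimal sketch of both paths, using the same `env.datasets` API that the tests above exercise (the deprecation message comes from the `deprecated=` argument; exactly how it surfaces to the user is an assumption here, not confirmed by this patch):

```py
import gym

import compiler_gym  # noqa: F401 (importing registers the llvm-v0 environment)

env = gym.make("llvm-v0")
try:
    # The default dataset list now resolves AnghaBench to the v1 manifest.
    anghabench = env.datasets["benchmark://anghabench-v1"]
    print(anghabench.size)  # 1,041,333 benchmarks on Linux, per the tests above

    # The old manifest is still reachable under the v0 name, but the dataset
    # is flagged as deprecated: "Please use anghabench-v1".
    legacy = env.datasets["benchmark://anghabench-v0"]
finally:
    env.close()
```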
From e135a48e707ca9d7b0e199ec6d6d3807b6045009 Mon Sep 17 00:00:00 2001
From: Chris Cummins
Date: Sun, 2 May 2021 18:23:14 +0100
Subject: [PATCH 002/141] [leaderboard] Flush results to CSV file immediately.

---
 compiler_gym/leaderboard/llvm_instcount.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler_gym/leaderboard/llvm_instcount.py b/compiler_gym/leaderboard/llvm_instcount.py
index 9d913bb4a..bbda9baa2 100644
--- a/compiler_gym/leaderboard/llvm_instcount.py
+++ b/compiler_gym/leaderboard/llvm_instcount.py
@@ -131,7 +131,7 @@ def run(self):
                 state = self.env.state.copy()
                 state.walltime = timer.time
-                writer.write_state(state)
+                writer.write_state(state, flush=True)
                 self.states.append(state)

                 if not self.alive:

From c5bd82a4936890de15b3eac6006c82f6bfed0624 Mon Sep 17 00:00:00 2001
From: Chris Cummins
Date: Sun, 2 May 2021 18:26:20 +0100
Subject: [PATCH 003/141] Add missing copyright header.

---
 tests/llvm/invalid_ir.ll | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/llvm/invalid_ir.ll b/tests/llvm/invalid_ir.ll
index ceea57466..fc76054e5 100644
--- a/tests/llvm/invalid_ir.ll
+++ b/tests/llvm/invalid_ir.ll
@@ -2,6 +2,11 @@
 ; This IR file can be assembled: $ llvm-as tests/llvm/invalid_ir.ll
 ; But it cannot be compiled: $ clang tests/llvm/invalid_ir.ll
 ; The error is: "error in backend: Cannot emit physreg copy instruction"
+;
+; Copyright (c) Facebook, Inc. and its affiliates.
+;
+; This source code is licensed under the MIT license found in the
+; LICENSE file in the root directory of this source tree.

 ; ModuleID = ''
 source_filename = "/tmp/autogen.bc"

From 9a5cb5aa5dba52b39e892a46796747fec7f9f877 Mon Sep 17 00:00:00 2001
From: Chris Cummins
Date: Mon, 3 May 2021 13:19:01 +0100
Subject: [PATCH 004/141] [README] Re-org the installation/usage structure.

Reduce the depth of the TOC and re-organize it so that installation
instructions come first, followed by usage.
---
 README.md | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index f6ee38e96..e1557e99f 100644
--- a/README.md
+++ b/README.md
@@ -37,10 +37,9 @@ developers to expose new optimization problems for AI.
 **Table of Contents**

 - [Features](#features)
-- [Getting Started](#getting-started)
-  - [Installation](#installation)
-  - [Building from Source](#building-from-source)
-  - [Trying it out](#trying-it-out)
+- [Installation](#installation)
+  - [Building from Source](#building-from-source)
+- [Usage](#usage)
 - [Leaderboards](#leaderboards)
   - [LLVM Instruction Count](#llvm-instruction-count)
 - [Contributing](#contributing)
@@ -81,15 +80,7 @@ features:
 For a glimpse of what's to come, check out [our roadmap](https://github.com/facebookresearch/CompilerGym/projects/1).

-# Getting Started
-
-Starting with CompilerGym is simple. If you not already familiar with the gym
-interface, refer to the
-[getting started guide](http://facebookresearch.github.io/CompilerGym/getting_started.html)
-for an overview of the key concepts.
-
-
-## Installation
+# Installation

 Install the latest CompilerGym release using:

@@ -98,12 +89,12 @@ Install the latest CompilerGym release using:

 The binary works on macOS and Linux (on Ubuntu 18.04, Fedora 28, Debian 10 or
 newer equivalents).

-### Building from Source
+## Building from Source

 If you prefer, you may build from source. This requires a modern C++ toolchain
 and bazel.
-#### macOS
+### macOS

 On macOS the required dependencies can be installed using
 [homebrew](https://docs.brew.sh/Installation):

@@ -117,7 +108,7 @@ export PKG_CONFIG_PATH="/usr/local/opt/zlib/lib/pkgconfig"

 Now proceed to [All platforms](#all-platforms) below.

-#### Linux
+### Linux

 On debian-based linux systems, install the required toolchain using:

@@ -130,7 +121,7 @@ export CC=clang
 export CXX=clang++
 ```

-#### All platforms
+### All platforms

 We recommend using
 [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/)
@@ -173,8 +164,12 @@ environment using:

 conda deactivate
 conda env remove -n compiler_gym

+# Usage

-## Trying it out
+Starting with CompilerGym is simple. If you are not already familiar with the
+gym interface, refer to the [getting started
+guide](http://facebookresearch.github.io/CompilerGym/getting_started.html) for
+an overview of the key concepts.

 In Python, import `compiler_gym` to use the environments:

From 168813780a155acf425202fb0d47dbe1a5e9e254 Mon Sep 17 00:00:00 2001
From: Chris Cummins
Date: Tue, 4 May 2021 11:02:46 +0100
Subject: [PATCH 005/141] [datasets] Remove fast-path for TarDataset.installed check.

---
 compiler_gym/datasets/tar_dataset.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/compiler_gym/datasets/tar_dataset.py b/compiler_gym/datasets/tar_dataset.py
index 55b15c73b..632ce935e 100644
--- a/compiler_gym/datasets/tar_dataset.py
+++ b/compiler_gym/datasets/tar_dataset.py
@@ -60,17 +60,13 @@ def __init__(
         self.tar_compression = tar_compression
         self.strip_prefix = strip_prefix

-        self._installed = False
         self._tar_extracted_marker = self.site_data_path / ".extracted"
         self._tar_lock = Lock()
         self._tar_lockfile = self.site_data_path / ".install_lock"

     @property
     def installed(self) -> bool:
-        # Fast path for repeated checks to 'installed' without a disk op.
-        if not self._installed:
-            self._installed = self._tar_extracted_marker.is_file()
-        return self._installed
+        return self._tar_extracted_marker.is_file()

     def install(self) -> None:
         super().install()

From e9435c49fbb4d7014606cf045d12fcd95befb8a3 Mon Sep 17 00:00:00 2001
From: Chris Cummins
Date: Tue, 4 May 2021 11:32:10 +0100
Subject: [PATCH 006/141] [README] Tweak example usage instructions

Don't use the `env.benchmark` attribute, and use an active writing style.
---
 README.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index e1557e99f..4b0660316 100644
--- a/README.md
+++ b/README.md
@@ -175,12 +175,14 @@ In Python, import `compiler_gym` to use the environments:

 ```py
 >>> import gym
->>> import compiler_gym # imports the CompilerGym environments
->>> env = gym.make("llvm-autophase-ic-v0") # starts a new environment
->>> env.benchmark = "benchmark://cbench-v1/qsort" # select a program to compile
->>> env.reset() # starts a new compilation session
->>> env.render() # prints the IR of the program
->>> env.step(env.action_space.sample()) # applies a random optimization, updates state/reward/actions
+>>> import compiler_gym # import the CompilerGym environments
+>>> env = gym.make( # create a new environment
+... "llvm-autophase-ic-v0", # select the compiler optimization task
+... benchmark="cbench-v1/qsort" # select the program to compile
+...
) +>>> env.reset() # start a new compilation session +>>> env.render() # print the IR of the program +>>> env.step(env.action_space.sample()) # apply a random optimization, update state/reward/actions ``` See the [documentation website](http://facebookresearch.github.io/CompilerGym/) From 34fab8c934f03a9f0e03695b3a3c35079ba2491a Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Tue, 4 May 2021 11:59:04 +0100 Subject: [PATCH 007/141] [third party] Prune dead code from inst2vec. Much of the imported inst2vec code is unused. This removes it. --- compiler_gym/third_party/inst2vec/BUILD | 6 - .../inst2vec/inst2vec_preprocess.py | 3003 +---------------- .../third_party/inst2vec/inst2vec_utils.py | 79 - .../third_party/inst2vec/rgx_utils.py | 74 - 4 files changed, 1 insertion(+), 3161 deletions(-) delete mode 100644 compiler_gym/third_party/inst2vec/inst2vec_utils.py diff --git a/compiler_gym/third_party/inst2vec/BUILD b/compiler_gym/third_party/inst2vec/BUILD index 0586c2a2e..d0b260fa9 100644 --- a/compiler_gym/third_party/inst2vec/BUILD +++ b/compiler_gym/third_party/inst2vec/BUILD @@ -33,16 +33,10 @@ py_library( name = "inst2vec_preprocess", srcs = ["inst2vec_preprocess.py"], deps = [ - ":inst2vec_utils", ":rgx_utils", ], ) -py_library( - name = "inst2vec_utils", - srcs = ["inst2vec_utils.py"], -) - py_library( name = "rgx_utils", srcs = ["rgx_utils.py"], diff --git a/compiler_gym/third_party/inst2vec/inst2vec_preprocess.py b/compiler_gym/third_party/inst2vec/inst2vec_preprocess.py index f377c6be6..7610aa95f 100644 --- a/compiler_gym/third_party/inst2vec/inst2vec_preprocess.py +++ b/compiler_gym/third_party/inst2vec/inst2vec_preprocess.py @@ -30,485 +30,9 @@ import networkx as nx -from compiler_gym.third_party.inst2vec import inst2vec_utils as i2v_utils from compiler_gym.third_party.inst2vec import rgx_utils as rgx -######################################################################################################################## -# Helper functions: list and stmt handling -######################################################################################################################## -def string_of_items(dic): - """ - Return a string containing all keys of a dictionary, separated by a comma - (Helper function for structure inlining) - :param dic: dictionary [key=string: value=string] - :return: string constructed of the dictionaries' keys - """ - s = "" - for k, v in dic.items(): - s += k + ": " + v + "\n" - return s - - -def collapse_into_one_list(data): - """ - Collapse list of list of strings into one list of strings - :param data: list of list of strings - :return: list of strings - """ - data_ = list() - for i in range(len(data)): - for j in range(len(data[i])): - data_.append(data[i][j]) - - return data_ - - -def string_from_list(l): - """ - Construct a string from a list of strings - :param l: list of strings - :return: string containing elements of list l separated by a comma - """ - s = l[0] - if len(l) > 1: - for i in range(len(l) - 1): - # only add this string to the list if it is different from the previous strings - e = l[i + 1] - if e not in l[0 : i + 1]: - s += ",\t\t" + e - return s - - -def create_list_stmts(list_graphs): - """ - Create a unique list of statements (strings) from a list of graphs in which statements are attributes of edges - :param list_graphs: list of context-graphs (nodes = ids, edges = statements) - :return: list_stmts: a unique list of statements (strings) - """ - list_stmts = list() - for G in list_graphs: - edges_list = [e[2]["stmt"] for e 
in G.edges(data=True)] - list_stmts += edges_list - - return list_stmts - - -######################################################################################################################## -# Counting and statistics -######################################################################################################################## -def get_stmt_counts(data_set, data_list): - """ - Get statement counts - :param data_set: set containing the elements from data_list but without repetitions and ordered - :param data_list: list of string statements with repetitions and no ordering - :return: data_count: dictionary with pairs [stmt, number of occurrences in data_list] - the order of the statements is the same as the one in data_set - data_operations_count: list of tuples - [string "tag level 1", "tag level 2", "tag level 3", int "number of occurrences"] - """ - # Setup variables - data_count = {x: 0 for x in data_set} - data_operations_count = list() - - # Compute stmt counts (overall) - print("Counting statement occurrences (overall)...") - for stmt in data_list: - data_count[stmt] += 1 - - # Check that all stmts have been counted (for debugging purposes) - total_stmt_count = sum(data_count.values()) - assert total_stmt_count == len(data_list), "Not all statements have been counted" - - # Compute stmt counts (by family) - print("Counting statement occurrences (by family) ...") - total_stmt_count = 0 - stmts_categorized = list() - - # Loop over stmt families - for fam in rgx.llvm_IR_stmt_families: - op_count = 0 - - # loop on all stmts in data - for i in range(len(data_set)): - # if the regular expression for the family matches - if re.match(fam[3], data_set[i], re.MULTILINE): - # add the corresponding number of occurrences to the counter - op_count += data_count[data_set[i]] - stmts_categorized.append(i) - - # append the count to the list of number of occurrences - data_operations_count.append([fam[0], fam[1], fam[2], op_count]) - - # increase the total stmt count - total_stmt_count += op_count - - # Check that all stmts have been categorized once and only once (debugging purposes) - print("Starting categorization check ...") - stmts_categorized = sorted(stmts_categorized) - if stmts_categorized != list(range(len(data_set))): - print("Tracking down the errors in categorization ... 
: ") - for i in range(len(data_set)): - num = stmts_categorized.count(i) - if num == 0: - print(data_set[i], "\n\tappears 0 times") - if num > 1: - print(data_set[i], "\n\tappears ", num, " times") - - assert stmts_categorized <= list( - range(len(data_set)) - ), "Not all statements have been categorized" - assert stmts_categorized >= list( - range(len(data_set)) - ), "Some statements have been categorized multiple times" - assert total_stmt_count == len(data_list), "Not all statements have been counted" - - return data_count, data_operations_count - - -def data_statistics(data, descr): - """ - Compute and print some statistics on the data - :param data: list of lists of statements (strings) - :param descr: string description of the current step of the pipeline to add to output - :return: source_data_list: list of statements - source_data sorted set of statements - """ - # Create a list of statements (strings) collecting the statements from all files - source_data_list = collapse_into_one_list(data) - - # Create a sorted set of statements appearing in our data set - source_data = sorted(set(source_data_list)) - - # Get number of lines and the vocabulary size - number_lines = len(source_data_list) - vocabulary_size = len(source_data) - - # Construct output - out = ( - "After " - + descr - + ":\n" - + "--- {:<26}: {:>12,d}\n".format("Number of lines", number_lines) - + "--- {:<26}: {:>12,d}\n".format("Vocabulary size", vocabulary_size) - ) - print(out) - - # Return - return source_data_list, source_data - - -######################################################################################################################## -# Reading, writing and dumping files -######################################################################################################################## - - -def read_data_files_from_folder(foldername): - """ - Read all source files in folder - Return a list of file contents, whereby each file content is a list of strings, each string representing a line - :param foldername: name of the folder in which the data files to be read are located - :return: a list of files where each file is a list of strings - """ - # Helper variables - data = list() - file_names = list() - file_count = 0 - - print("Reading data from all files in folder ", foldername) - listing = os.listdir(foldername + "/") - to_subtract = file_count - - # Loop over files in folder - for file in listing: - if file[0] != "." 
and file[-3:] == ".ll": - # If this isn't a hidden file and it is an LLVM IR file ('.ll' extension), - # open file and import content - f = open(os.path.join(foldername, file), "r") - data.append( - f.read().splitlines() - ) # add this file as an element to the list "data" - f.close() - - # Add file name to dictionary - file_names.append(file) - - # Increment counters - file_count += 1 - - print("Number of files read from", foldername, ": ", file_count - to_subtract) - print("Total number of files read for dataset", foldername, ": ", file_count) - return data, file_names - - -def print_preprocessed_data(raw_data, foldername, filenames): - """ - Write pre-processed code to file for future reference - :param raw_data: a list of files where each file is a list of strings - :param foldername: folder in which to print - :param filenames: list of base file names - :return: - """ - # Make sure the directory exists - if not, create it - foldername = os.path.join(foldername, "preprocessed") - if not os.path.exists(foldername): - os.makedirs(foldername) - - # Write pre-processed code to files - i = 0 - for file in raw_data: - filename = os.path.join(foldername, filenames[i][:-3] + "_preprocessed.txt") - print("Writing pre-processed data to file ", filename) - with open(filename, "w") as f: - for l in file: - f.write(l + "\n") - i += 1 - - -def print_data(data, filename): - """ - Write pre-processed code to file for future reference - :param data: a list of strings - :param filename: name of file to print this to (string) - :return: - """ - print("Write data to file ", filename) - with open(filename, "w") as f: - for l in data: - f.write(l + "\n") - - -def sort_key(x): - """ - Helper function to sort nodes - :param x: node - :return: node name, node id type - """ - id_part = x[0][1:] - - if id_part.isdigit(): - return x[0][0], int(x[0][1:]) - else: - return x[0][0], 1 - - -def print_node_family_to_file(G, f, nodetype): - """ - Helper function for function "print_graph_to_file" - :param G: graph - :param f: file handle - :param nodetype: string corresponding to the "id" of the node family to be printed - """ - - # Construct node family - if nodetype == "root": - node_family = [ - n for n in G.nodes() if G.out_degree(n) > 0 and G.in_degree(n) == 0 - ] - node_family = sorted(node_family, key=sort_key) - elif nodetype == "leaf": - node_family = [ - n for n in G.nodes() if G.out_degree(n) == 0 and G.in_degree(n) >= 1 - ] - node_family = sorted(node_family, key=sort_key) - elif nodetype == "isolated": - node_family = [n for n in G.nodes() if G.degree(n) == 0] - node_family = sorted(node_family, key=sort_key) - else: - node_family = [ - n[0] - for n in sorted(list(G.nodes(data=True)), key=sort_key) - if n[1]["id"] == nodetype - ] - - # Write to file - f.write("#nodes: " + str(len(node_family)) + "\n") - f.write("-" * 80 + "\n") - for n in node_family: - f.write("{n:<60}\n".format(n=n)) - - -def print_graph_to_file(G, multi_edge_dic, folder, filename): - """ - Print information about a graph to a file - :param G: graph - :param multi_edge_dic: dictionary of multi-edges - = edges for which a parallel edge connecting the same two end-nodes exists - :param folder: folder in which to write - :param filename: base name of the graph - """ - # Print to file - graph_filename = os.path.join(folder, filename[:-3] + ".txt") - print("Printing graph to file : ", graph_filename) - - with open(graph_filename, "w") as f: - - # GENERAL - f.write("#nodes: " + str(G.number_of_nodes()) + "\n") - f.write("#edges: " + 
str(G.number_of_edges()) + "\n\n") - - # INFORMATION ON NODES - # all - f.write("Nodes (" + str(G.number_of_nodes()) + "):\n") - f.write("-" * 80 + "\n") - for n, data in sorted(G.nodes(data=True), key=sort_key): - f.write("{n:<60}, {w}\n".format(n=n[:60], w=data["id"])) - - # local - f.write("\nLocal identifier nodes: \n") - print_node_family_to_file(G, f, "local") - - # block references - f.write("\nBlock reference nodes: \n") - print_node_family_to_file(G, f, "label") - - # global - f.write("\nGlobal nodes: \n") - print_node_family_to_file(G, f, "global") - - # immediate value - f.write("\nImmediate value nodes: \n") - print_node_family_to_file(G, f, "imm_val") - - # ad_hoc - f.write("\nAd hoc value nodes: \n") - print_node_family_to_file(G, f, "ad_hoc") - - # leaf - f.write("\nLeaf nodes: \n") - print_node_family_to_file(G, f, "leaf") - - # root - f.write("\nRoot nodes: \n") - print_node_family_to_file(G, f, "root") - - # isolated - f.write("\nIsolated nodes: \n") - print_node_family_to_file(G, f, "isolated") - f.write("\n\n") - - # INFORMATION ON EDGES - # all - f.write("Edges (" + str(G.number_of_edges()) + ")\n") - f.write("-" * 80 + "\n") - for a, b, data in sorted(G.edges(data=True), key=sort_key): - f.write( - "({a:<30}, {b:<30}) {w}\n".format(a=a[:30], b=b[:30], w=data["stmt"]) - ) - - # data flow edges - dataedges = [ - (str(n[0]), str(n[1]), str(n[2])) - for n in sorted(list(G.edges(data=True)), key=sort_key) - if n[2]["flow"] == "data" - ] - f.write("\nData flow edges: \n") - f.write( - "#edges: " - + str(len(dataedges)) - + " (" - + str(int(len(dataedges)) / G.number_of_edges() * 100)[:5] - + "%)\n" - ) - f.write("-" * 80 + "\n") - for e in dataedges: - f.write("({a:<30}, {b:<30}) {c}\n".format(a=e[0][:30], b=e[1][:30], c=e[2])) - - # control flow edges - ctrledges = [ - (str(n[0]), str(n[1]), str(n[2])) - for n in sorted(list(G.edges(data=True)), key=sort_key) - if n[2]["flow"] == "ctrl" - ] - f.write("\nCtrl flow edges: \n") - f.write( - "#edges: " - + str(len(ctrledges)) - + " (" - + str(int(len(dataedges)) / G.number_of_edges() * 100)[:5] - + "%)\n" - ) - f.write("-" * 80 + "\n") - for e in ctrledges: - f.write("({a:<30}, {b:<30}) {c}\n".format(a=e[0][:30], b=e[1][:30], c=e[2])) - - # multi-edges - f.write("\nMulti-edges: \n") - multi_edge_list = list() - for k, v in multi_edge_dic.items(): # Compile the multi-edges - multi_edge_list += v - f.write( - "#multi-edges: " - + str(len(multi_edge_list)) - + " (" - + str(int(len(multi_edge_list)) / G.number_of_edges() * 100)[:5] - + "%)\n" - ) - f.write( - "#node pairs connected by multi-edges: " - + str(len(multi_edge_dic.keys())) - + " (" - + str(int(len(multi_edge_dic)) / G.number_of_edges() * 100)[:5] - + "%)\n" - ) - f.write("-" * 80 + "\n") - for k, v_ in multi_edge_dic.items(): - n = re.match(r"(.*) \|\|\| (.*)", k) - assert n is not None, "Could not identify nodes in " + k - f.write("{m:<60} {p:<60}\n".format(m=n.group(1)[:60], p=n.group(2)[:60])) - for v in v_: - f.write("\t{}\n".format(v)) - f.write("\n") - - -def print_structure_dictionary(dic, folder, filename): - """ - Print the dictionary of structures to a file - :param dic: dictionary ["structure name", [list of possible values]] - :param folder: name of folder in which to print dictionary - :param filename: name of file in which to print dictionary - :return: - """ - # Print dictionary in alphabetical order - dic_filename = os.path.join(folder, filename[:-3] + ".txt") - print('Printing dictionary to file "', dic_filename) - with open(dic_filename, "w") as f: - 
f.write("{:<70} {}\n\n".format("structure name", "literal value")) - for key, value in sorted(dic.items()): - f.write("{:<70} {}\n".format(key, string_from_list(value))) - - -def PrintDualXfgToFile(D, folder, filename): - """Print dual-XFG graph to file. - - :param D: dual-XFG graphs - :param folder: name of folder in which to print dictionary - :param filename: name of file in which to print dictionary - """ - # Print to file - graph_filename = os.path.join(folder, filename[:-3] + ".txt") - print("Printing graph to file : ", graph_filename) - - with open(graph_filename, "w") as f: - # GENERAL - f.write("#nodes: " + str(D.number_of_nodes()) + "\n") - f.write("#edges: " + str(D.number_of_edges()) + "\n\n") - - # INFORMATION ON NODES - f.write("Nodes (" + str(D.number_of_nodes()) + ")\n") - f.write("-" * 80 + "\n") - for n, _ in sorted(D.nodes(data=True), key=sort_key): - f.write(f"{n:<60}\n") - f.write("\n") - # INFORMATION ON EDGES - f.write("Edges (" + str(D.number_of_edges()) + ")\n") - f.write("-" * 80 + "\n") - for a, b, data in sorted(D.edges(data=True), key=sort_key): - f.write( - "({a:<37}, {b:<37}) {w}\n".format(a=a[:37], b=b[:37], w=data["weight"]) - ) - - ######################################################################################################################## # LLVM IR preprocessing ######################################################################################################################## @@ -810,2085 +334,6 @@ def preprocess(data): return preprocessed_data, functions_declared_in_files -######################################################################################################################## -# XFG-building -######################################################################################################################## -def get_identifiers_from_line(line): - """ - Extract identifiers (local, global and label) from a statement - :param line: string: (part of) statement - :return: lists of strings: m_loc, m_glob, m_label, m_label2 - """ - # Find label nodes - m_label = m_label2 = list() - if line.find("label") != -1 or re.match(rgx.local_id_no_perc + r":", line): - m_label1 = re.findall("label (" + rgx.local_id + ")", line) - if re.match(r";