From 4b66981ca12689ea30349eac0889176f6a81840f Mon Sep 17 00:00:00 2001 From: Jan Date: Sat, 16 Dec 2023 01:57:00 +0100 Subject: [PATCH 01/10] Add shadereval tasks --- bigcode_eval/tasks/__init__.py | 3 +- bigcode_eval/tasks/shadereval.py | 234 +++++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 bigcode_eval/tasks/shadereval.py diff --git a/bigcode_eval/tasks/__init__.py b/bigcode_eval/tasks/__init__.py index 8162a5f1a..1e6ef366c 100644 --- a/bigcode_eval/tasks/__init__.py +++ b/bigcode_eval/tasks/__init__.py @@ -5,7 +5,7 @@ concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack, instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus, multiple, parity, python_bugs, quixbugs, recode, santacoder_fim, - studenteval, mercury) + studenteval, mercury,shadereval) TASK_REGISTRY = { **apps.create_all_tasks(), @@ -31,6 +31,7 @@ **santacoder_fim.create_all_tasks(), "studenteval": studenteval.StudentEval, "mercury": mercury.Mercury, + **shadereval.create_all_tasks(), } ALL_TASKS = sorted(list(TASK_REGISTRY)) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py new file mode 100644 index 000000000..82bfbcd93 --- /dev/null +++ b/bigcode_eval/tasks/shadereval.py @@ -0,0 +1,234 @@ +# This template file is adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/master/templates/new_task.py + +# TODO: Remove all TODO comments once the implementation is complete. +""" +Paper-Title: Throwing Shaders at Language Models - Evaluating Creative Code Generation +TODO: Paper-URL: unavailable (unpublished) +Description: ShaderEval aims to be a suite of tasks to evaluate generative model on creative code generation. Espeicically GLSL shadercode. + Task1 is a proof of concept and looks at code completion for returnstatemetns of Shadertoy functions. Exact_match and greedy decoding. 
+Homepage: https://huggingface.co/spaces/Vipitis/ShaderEval + +Paper-Title: an unknown title for my bachelor thesis (A Comprehensive Evaluation of shadercode generation with language models) +TODO: Paper-URL: unavailable (unapproved) +Description: Doing everything better than before. + Task-1b a better version of Task1 (Return Completion) using a deduplicated dataset as well as more metrics (notImplemented) + Task-2: Function Generation - given a function signature and a docstring, generate the function body, + tested by patching it back into the original shadercode and comparing if the rendered images are the same. (currently in development, open for debate) + Task-3: Semantic generation given a title and description, recursively generate more shadercode untill it renders, scored by CLIP match (in planing...) + + (potential) Instruct variant: all banchmark tasks phrased for instruction tuned models (time permitting) +Homepage: https://huggingface.co/spaces/Vipitis/ShaderEval (could be something else...?) +""" +from lm_eval.base import Task +import evaluate +import datasets +# from ..ShaderCoder.utils import parse_functions, construct_model_context, replace_function #where to import this from(via custom metric?) + +# TODO: Add the BibTeX citation for the task. +_CITATION = """tbd +""" + +def create_all_tasks(): + """assemble all tasks in a dictionary: + - task1: return completion + - task2: function generation + """ + return { + "shadereval-1": ReturnCompletion, + "shadereval-2": FunctionGeneration, + } + +# TODO: Replace `NewTask` with the name of your Task. +class ReturnCompletion(Task): #Task1 + # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` + # dataset as denoted in HuggingFace `datasets`. + DATASET_PATH = "Vipitis/Shadertoys-fine" + # TODO: Add the `DATASET_NAME` string. This is the name of a subset within + # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. 
+ DATASET_NAME = "return_completion" + + def __init__(self): + super().__init__( + # TODO: Specify the list of stop words in `stop_words` for the code generation task \ + # and if the evaluation requires executing the generated code in `requires_execution`. + stop_words=[";"], + requires_execution=False, + ) + + def get_dataset(self): + # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`) + """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" + return self.dataset["test"] + + def fewshot_examples(self): + # TODO: load few-shot examples (from lm_eval/tasks/fewshot_examples) if they exist + """Loads and returns the few-shot examples for the task if they exist.""" + pass + + def get_prompt(self, doc): + # TODO: build the prompt for the language model from a sample `doc` from the dataset + """ + Builds the prompt for the LM to generate from. + :param doc: dict[str: str] + sample from the test dataset + :return: str + """ + return doc["body"] + + def get_reference(self, doc): + # TODO: get the reference solution from a sample `doc` from the dataset + """ + Builds the reference solution for the doc (sample from the test dataset). + :param doc: dict[str: str] + sample from the test dataset + :return: str + """ + return doc["return_statement"].split(";")[0].strip() + + def postprocess_generation(self, generation, idx): + # TODO: define the postprocessing for the LM generation + """ + Defines the postprocessing for a LM generation. + :param generation: str + code generation from LM + :param idx: int (if needed) + index of doc in the dataset to which the generation belongs + :return: str + """ + generation = generation.split("return")[1] # this works? 
+ return generation.split(";")[0].strip() + + def process_results(self, generations, references): + # TODO: define how the evaluation score is computed from list of \ + # generations and reference solutions + """ + Takes the list of LM generations and evaluates them against ground truth references, + returning the metric for the generations as in {"metric_name": result}. + We encourage to directly load the metric from `evaluate` library to keep the code concise. + :param generations: list(list(str)) + list of lists containing generations + :param references: list(str) + list of str containing refrences + :return: dict[str: float] + """ + exact_match = evaluate.load("exact_match") + generations = [ + generation[0] for generation in generations + ] # unpack one list for some reason? (we zero shot) + return exact_match.compute(predictions=generations, references=references) + + + +# TODO: Replace `NewTask` with the name of your Task. +class FunctionGeneration(Task): #task2 + DATASET_PATH = "Vipitis/Shadertoys-FunctionGeneration-dev" #as a temporary solution to reduce current problems + + # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. + DATASET_NAME = None #this will eventually be a subset for the Shadertoys dataset, but not right now + + def __init__(self): + super().__init__( + # TODO: Specify the list of stop words in `stop_words` for the code generation task \ + # and if the evaluation requires executing the generated code in `requires_execution`. + stop_words=["\nfloat ", "\nvec", "\nint", "\nvoid", "\nmat"], #new function starts... so all the keywords + requires_execution=True, #we run shadercode - could that be harmful? 
(all in the metric) + ) + + def get_dataset(self): + # TODO replace with subset once that is set up + return self.dataset["test"] + + def fewshot_examples(self): + # TODO: load few-shot examples (from lm_eval/tasks/fewshot_examples) if they exist + """Loads and returns the few-shot examples for the task if they exist.""" + pass + + def get_prompt(self, doc): + # TODO: build the prompt for the language model from a sample `doc` from the dataset + """ + Builds the prompt for the LM to generate from. + :param doc: dict[str: str] + sample from the test dataset + :return: str + """ + + # alternatively, give the whole code up untill the function declaration ends? as in this paper: https://arxiv.org/abs/2306.03203 + return doc["model_ctx"] + + def get_reference(self, doc): + # TODO: get the reference solution from a sample `doc` from the dataset + """ + Builds the reference solution for the doc (sample from the test dataset). + :param doc: dict[str: str] + sample from the test dataset + :return: str + """ + return doc["full_code"] #returns full original code + + def remove_last_block(self, code): + """ + Adapted from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/be2a44c2faa29c20b5041d7083acb698eb373309/bigcode_eval/tasks/humanevalpack.py#L275C5-L311C20 + """ + for w in self.stop_words: + if w in code: + code = code[:code.find(w)] + + ### Find the first occassion where a chain of { } is closed?? + open_brackets = 1 + cut = False + for i, c in enumerate(code): + if c == '{': + open_brackets += 1 + elif c == '}': + open_brackets -= 1 + if open_brackets == 0: + code = code[:i+1] + cut = True + break + if not cut: + if '}' in code: + code = code[:code.rfind('}')] + '}' + return code + + def postprocess_generation(self, generation, idx): + # TODO: define the postprocessing for the LM generation + """ + Defines the postprocessing for a LM generation. 
+ :param generation: str + code generation from LM + :param idx: int (if needed) + index of doc in the dataset to which the generation belongs + :return: str + """ + # TODO: trim generation to just the first function -> how do we get the parser in here? + # from: https://huggingface.co/spaces/Vipitis/ShaderCoder/blob/main/utils/tree_utils.py#L45 + # generation = ShaderCoder.utils.parse_functions(generation)[0].text.decode() #not easily imported... + + + # assemble into the full code with just the function replaced + ref = self.dataset["test"][idx] + model_ctx = ref["model_ctx"] + full_code = ref["full_code"] + start, end = ref["func_range"] + gen = self.remove_last_block(generation[len(model_ctx):]) #remove last block to avoid syntax errors + + return full_code[:start] + gen + full_code[end:] #does this patch it together correctly? + + def process_results(self, generations, references): + # TODO: define how the evaluation score is computed from list of \ + # generations and reference solutions + """ + Takes the list of LM generations and evaluates them against ground truth references, + returning the metric for the generations as in {"metric_name": result}. + We encourage to directly load the metric from `evaluate` library to keep the code concise. + :param generations: list(list(str)) + list of lists containing generations + :param references: list(str) + list of str containing refrences + :return: dict[str: float] + """ + shadermatch = evaluate.load("Vipitis/shadermatch") + generations = [ + generation[0] for generation in generations + ] # unpack one list for some reason? 
(we zero shot) + return shadermatch.compute(predictions=generations, references=references) From 3c67f3263561c340d25ae196ab8e74a75f9cee53 Mon Sep 17 00:00:00 2001 From: Jan Date: Sun, 17 Dec 2023 22:03:45 +0100 Subject: [PATCH 02/10] Fix imports --- bigcode_eval/tasks/shadereval.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index 82bfbcd93..e8eb60393 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -19,10 +19,9 @@ (potential) Instruct variant: all banchmark tasks phrased for instruction tuned models (time permitting) Homepage: https://huggingface.co/spaces/Vipitis/ShaderEval (could be something else...?) """ -from lm_eval.base import Task +from bigcode_eval.base import Task import evaluate import datasets -# from ..ShaderCoder.utils import parse_functions, construct_model_context, replace_function #where to import this from(via custom metric?) # TODO: Add the BibTeX citation for the task. _CITATION = """tbd From 2f4c4c2db277d3cf6dd4b42b901b3ff738fc3b80 Mon Sep 17 00:00:00 2001 From: Jan Date: Sun, 17 Dec 2023 23:37:33 +0100 Subject: [PATCH 03/10] Fix missing prompt --- bigcode_eval/tasks/shadereval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index e8eb60393..cbf4eada7 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -211,7 +211,7 @@ def postprocess_generation(self, generation, idx): start, end = ref["func_range"] gen = self.remove_last_block(generation[len(model_ctx):]) #remove last block to avoid syntax errors - return full_code[:start] + gen + full_code[end:] #does this patch it together correctly? + return full_code[:start] + model_ctx + gen + full_code[end:] #does this patch it together correctly? 
def process_results(self, generations, references): # TODO: define how the evaluation score is computed from list of \ From 85f3420b1348133f2368ddda746bfc5584b208c5 Mon Sep 17 00:00:00 2001 From: Jan Date: Thu, 28 Dec 2023 22:21:02 +0100 Subject: [PATCH 04/10] Fix non unicode characters causing offset --- bigcode_eval/tasks/shadereval.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index cbf4eada7..67af6ddfe 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -175,13 +175,14 @@ def remove_last_block(self, code): ### Find the first occassion where a chain of { } is closed?? open_brackets = 1 cut = False - for i, c in enumerate(code): + for i, c in enumerate(code.encode("utf-8")): + c = chr(c) if c == '{': open_brackets += 1 elif c == '}': open_brackets -= 1 if open_brackets == 0: - code = code[:i+1] + code = code.encode("utf-8")[:i+1].decode("utf-8", "ignore") cut = True break if not cut: @@ -209,9 +210,10 @@ def postprocess_generation(self, generation, idx): model_ctx = ref["model_ctx"] full_code = ref["full_code"] start, end = ref["func_range"] - gen = self.remove_last_block(generation[len(model_ctx):]) #remove last block to avoid syntax errors - - return full_code[:start] + model_ctx + gen + full_code[end:] #does this patch it together correctly? + gen = self.remove_last_block(generation.encode("utf-8")[len(model_ctx.encode("utf-8")):].decode("utf-8")) #remove last block to avoid syntax errors + before_gen = full_code.encode("utf-8")[:start].decode("utf-8") + after_gen = full_code.encode("utf-8")[end:].decode("utf-8") + return before_gen + model_ctx + gen + after_gen #does this patch it together correctly? 
def process_results(self, generations, references): # TODO: define how the evaluation score is computed from list of \ From 3ba31cd508be59ebd73505a3f77b7b32bffe970c Mon Sep 17 00:00:00 2001 From: Jan Date: Thu, 4 Jan 2024 01:04:10 +0100 Subject: [PATCH 05/10] Add "full" prompt option --- bigcode_eval/tasks/shadereval.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index 67af6ddfe..7e98be100 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -131,6 +131,7 @@ def __init__(self): # and if the evaluation requires executing the generated code in `requires_execution`. stop_words=["\nfloat ", "\nvec", "\nint", "\nvoid", "\nmat"], #new function starts... so all the keywords requires_execution=True, #we run shadercode - could that be harmful? (all in the metric) + prompt="minimal", # "minimal" or "full". "minimal" is the function header and comments before/after it, "full" is the whole code up untill the function declaration ends ) def get_dataset(self): @@ -146,13 +147,19 @@ def get_prompt(self, doc): # TODO: build the prompt for the language model from a sample `doc` from the dataset """ Builds the prompt for the LM to generate from. + if prompt == "minimal" -> function header and comments before/after it + if prompt == "full" -> also includes full code before the function header :param doc: dict[str: str] sample from the test dataset :return: str """ - - # alternatively, give the whole code up untill the function declaration ends? as in this paper: https://arxiv.org/abs/2306.03203 - return doc["model_ctx"] + model_context = "" + if self.prompt == "full": + # alternatively, give the whole code up untill the function declaration ends? 
as in this paper: https://arxiv.org/abs/2306.03203 + model_context += doc["full_code"].encode("utf-8")[:doc["func_range"][0]].decode("utf-8") #returns full original code up untill the function declaration ends + # only have one alternative, but could be more? + model_context += doc["model_ctx"] + return model_context def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset @@ -172,7 +179,7 @@ def remove_last_block(self, code): if w in code: code = code[:code.find(w)] - ### Find the first occassion where a chain of { } is closed?? + ### Find the first occassion where a chain of { } is closed?? open_brackets = 1 cut = False for i, c in enumerate(code.encode("utf-8")): @@ -210,9 +217,13 @@ def postprocess_generation(self, generation, idx): model_ctx = ref["model_ctx"] full_code = ref["full_code"] start, end = ref["func_range"] - gen = self.remove_last_block(generation.encode("utf-8")[len(model_ctx.encode("utf-8")):].decode("utf-8")) #remove last block to avoid syntax errors before_gen = full_code.encode("utf-8")[:start].decode("utf-8") after_gen = full_code.encode("utf-8")[end:].decode("utf-8") + + if self.prompt == "full": + gen = self.remove_last_block(generation.encode("utf-8")[start + len(model_ctx.encode("utf-8")):].decode("utf-8")) + else: + gen = self.remove_last_block(generation.encode("utf-8")[len(model_ctx.encode("utf-8")):].decode("utf-8")) #remove last block to avoid syntax errors return before_gen + model_ctx + gen + after_gen #does this patch it together correctly? 
def process_results(self, generations, references): From f183a17eebdd8b2d9b4ba5bf71ba6bac95846ddc Mon Sep 17 00:00:00 2001 From: Jan Date: Thu, 4 Jan 2024 01:08:50 +0100 Subject: [PATCH 06/10] Fix kwarg in init --- bigcode_eval/tasks/shadereval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index 7e98be100..7e76cf196 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -125,14 +125,14 @@ class FunctionGeneration(Task): #task2 # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. DATASET_NAME = None #this will eventually be a subset for the Shadertoys dataset, but not right now - def __init__(self): + def __init__(self, prompt="minimal"): super().__init__( # TODO: Specify the list of stop words in `stop_words` for the code generation task \ # and if the evaluation requires executing the generated code in `requires_execution`. stop_words=["\nfloat ", "\nvec", "\nint", "\nvoid", "\nmat"], #new function starts... so all the keywords requires_execution=True, #we run shadercode - could that be harmful? (all in the metric) - prompt="minimal", # "minimal" or "full". "minimal" is the function header and comments before/after it, "full" is the whole code up untill the function declaration ends ) + self.prompt = prompt # "minimal" or "full". 
"minimal" is the function header and comments before/after it, "full" is the whole code up untill the function declaration ends def get_dataset(self): # TODO replace with subset once that is set up From de0bab714bddd009bbf245ce06e177c6f08c47a7 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 8 Jan 2024 19:15:34 +0100 Subject: [PATCH 07/10] Add tagging for "incomplete generations" --- bigcode_eval/tasks/shadereval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index 7e76cf196..3e5790b02 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -195,6 +195,8 @@ def remove_last_block(self, code): if not cut: if '}' in code: code = code[:code.rfind('}')] + '}' + else: + code = code + "// incomplete generation! \n" return code def postprocess_generation(self, generation, idx): From fcafbab718aedb885db494f4e1aa8b475054d9c3 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 4 Mar 2024 21:27:31 +0100 Subject: [PATCH 08/10] Add documentation --- README.md | 1 + docs/README.md | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/README.md b/README.md index aa3bb89e3..60c49ec5d 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Below are the features and tasks of this framework: - `StarCoderFIM`: which uses the default FIM tokens `"", "", ""`, and - `SantaCoderFIM`: which uses SantaCoder FIM tokens `"", "", ""` - [Mercury](https://huggingface.co/datasets/Elfsong/Mercury) for evaluating computational efficiency of **Python** code generation. + - Shadereval for **GLSL** code understanding ([task1](https://huggingface.co/spaces/Vipitis/ShaderEval)) and generation ([task2](https://huggingface.co/spaces/Vipitis/shadermatch)) More details about each task can be found in the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md). 
## Setup diff --git a/docs/README.md b/docs/README.md index 903c6a122..a944928f4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -426,6 +426,46 @@ accelerate launch main.py \ --metric_output_path .json ``` +### Shadereval +[Shadereval](tbd.) explores "creative" code generation. Fragment shaders are sourced from Shadertoy.com and curated into the [Shadertoys](https://huggingface.co/datasets/Vipitis/Shadertoys) dataset. The task specific datasets are build from the Shadertoys dataset and therefore share a common train/test split. + +Task-1: **ReturnCompletion** provides a function header and body, so the model generates a matching return statement. Generations are evaluated by `exact-match` therefore does not require code execution. The original publication uses greedy decoding and only 300 samples. + +```bash +accelerate launch main.py \ + --model \ + --tasks shadereval-1 \ + --n_samples 300 \ + --do_sample False \ +``` + +Task-2: **FunctionGeneration** parses comments directly before or after the function header as model input. The model is expected to generate a complete function that is syntactially sound. Generated functions are inserted in the original shader program for evaluation. A custom metric is hosted in the [demo space](https://huggingface.co/spaces/Vipitis/shadermatch) which render frames to compare. This requires an additional dependency [wgpu-shadertoy](https://github.com/pygfx/shadertoy). It's recommended to generate generations first and then evaluate them later. +The reference uses greedy decoding and fp16 for the first 300 examples. 
+ +```bash +accelerate launch main.py \ + --model \ + --tasks shadereval-2 \ + --generation_only \ + --save_generations_path "saved_generations.json" \ + --allow_code_execution \ + --limit 300 \ + --do_sample False \ --precision fp16 \ +``` + +To evaluate later run the following command: + +```bash +accelerate launch main.py \ + --model \ + --tasks shadereval-2 \ + --load_generations_path "saved_generations.json" \ + --allow_code_execution \ + --limit 300 \ + --metric_output_path "eval_results.json" \ + --precision fp16 +``` + ## Code generation benchmarks without unit tests For these tasks, we do single generations and compare the generated code against reference solutions and compute BLEU score. For the following tasks, we use a two-shot setting where we include 2 inputs and their solutions in the prompt, all preceded by an instruction such as: ` "Answer the following instructions in a one line SQL query:\n"`. The solutions consist of one line so we stop the generation when a new line is generated. 3 languages are present: Python, SQL and Java. From 2b17284377917450f75e44bb23373769a5253cf2 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 16 Sep 2024 02:06:57 +0200 Subject: [PATCH 09/10] Update FunctionCompletion task --- bigcode_eval/tasks/shadereval.py | 57 ++++++++++++++------------------ docs/README.md | 18 +++++----- 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index 3e5790b02..e9177e732 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -8,16 +8,10 @@ Task1 is a proof of concept and looks at code completion for returnstatemetns of Shadertoy functions. Exact_match and greedy decoding. 
Homepage: https://huggingface.co/spaces/Vipitis/ShaderEval -Paper-Title: an unknown title for my bachelor thesis (A Comprehensive Evaluation of shadercode generation with language models) -TODO: Paper-URL: unavailable (unapproved) -Description: Doing everything better than before. - Task-1b a better version of Task1 (Return Completion) using a deduplicated dataset as well as more metrics (notImplemented) - Task-2: Function Generation - given a function signature and a docstring, generate the function body, - tested by patching it back into the original shadercode and comparing if the rendered images are the same. (currently in development, open for debate) - Task-3: Semantic generation given a title and description, recursively generate more shadercode untill it renders, scored by CLIP match (in planing...) - - (potential) Instruct variant: all banchmark tasks phrased for instruction tuned models (time permitting) -Homepage: https://huggingface.co/spaces/Vipitis/ShaderEval (could be something else...?) +Paper-Title: Evaluating language models for computer graphics code completion +TODO: Paper-URL: unavailable (unpublished) +Description: Function Completion task for GLSL shadercode. Metric statically compares and then runs generated code to compare rendered frames with the refernece. +Homepage: https://huggingface.co/spaces/Vipitis/Shadermatch """ from bigcode_eval.base import Task import evaluate @@ -41,7 +35,7 @@ def create_all_tasks(): class ReturnCompletion(Task): #Task1 # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` # dataset as denoted in HuggingFace `datasets`. - DATASET_PATH = "Vipitis/Shadertoys-fine" + DATASET_PATH = "Vipitis/Shadertoys-fine" # now defunct. # TODO: Add the `DATASET_NAME` string. This is the name of a subset within # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. 
DATASET_NAME = "return_completion" @@ -120,19 +114,21 @@ def process_results(self, generations, references): # TODO: Replace `NewTask` with the name of your Task. class FunctionGeneration(Task): #task2 - DATASET_PATH = "Vipitis/Shadertoys-FunctionGeneration-dev" #as a temporary solution to reduce current problems + DATASET_PATH = "Vipitis/Shadereval-experiments-dev" #as a temporary solution to reduce current problems # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. DATASET_NAME = None #this will eventually be a subset for the Shadertoys dataset, but not right now - def __init__(self, prompt="minimal"): + def __init__(self): super().__init__( # TODO: Specify the list of stop words in `stop_words` for the code generation task \ # and if the evaluation requires executing the generated code in `requires_execution`. - stop_words=["\nfloat ", "\nvec", "\nint", "\nvoid", "\nmat"], #new function starts... so all the keywords + # stop_words=["\nfloat ", "\nvec", "\nint", "\nvoid", "\nmat"], #new function starts... so all the keywords + # TODO: stopwords can cause incorrect early stopping, so we don't edn up using them. I am considering using guided generation with tree-sitter to do early stopping. + stop_words=[], #set it's to Falsy? requires_execution=True, #we run shadercode - could that be harmful? (all in the metric) ) - self.prompt = prompt # "minimal" or "full". "minimal" is the function header and comments before/after it, "full" is the whole code up untill the function declaration ends + self._metric = evaluate.load("Vipitis/shadermatch") #load the metric from the evaluate library def get_dataset(self): # TODO replace with subset once that is set up @@ -146,20 +142,11 @@ def fewshot_examples(self): def get_prompt(self, doc): # TODO: build the prompt for the language model from a sample `doc` from the dataset """ - Builds the prompt for the LM to generate from. 
- if prompt == "minimal" -> function header and comments before/after it - if prompt == "full" -> also includes full code before the function header :param doc: dict[str: str] sample from the test dataset :return: str """ - model_context = "" - if self.prompt == "full": - # alternatively, give the whole code up untill the function declaration ends? as in this paper: https://arxiv.org/abs/2306.03203 - model_context += doc["full_code"].encode("utf-8")[:doc["func_range"][0]].decode("utf-8") #returns full original code up untill the function declaration ends - # only have one alternative, but could be more? - model_context += doc["model_ctx"] - return model_context + return doc["model_inp"] def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset @@ -169,12 +156,13 @@ def get_reference(self, doc): sample from the test dataset :return: str """ - return doc["full_code"] #returns full original code + return doc["image_code"] #returns full original code def remove_last_block(self, code): """ Adapted from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/be2a44c2faa29c20b5041d7083acb698eb373309/bigcode_eval/tasks/humanevalpack.py#L275C5-L311C20 """ + # TODO: can be removed for w in self.stop_words: if w in code: code = code[:code.find(w)] @@ -209,11 +197,14 @@ def postprocess_generation(self, generation, idx): index of doc in the dataset to which the generation belongs :return: str """ - # TODO: trim generation to just the first function -> how do we get the parser in here? - # from: https://huggingface.co/spaces/Vipitis/ShaderCoder/blob/main/utils/tree_utils.py#L45 - # generation = ShaderCoder.utils.parse_functions(generation)[0].text.decode() #not easily imported... - + row = self.dataset["test"][idx] + truncated = self._metric.truncate_generation(model_inp="", generation=generation) + # TODO: the metric methods will be renaming their args to be more broadly useable.. maybe even refactor the bit at the top. 
+ altered = self._metric.replace_body(ref_code=row["image_code"], altered_body=truncated, end_header_byte=row["func_bytes"][0], end_function_byte=row["func_bytes"][4]) + return altered + + # TODO: remove the old code # assemble into the full code with just the function replaced ref = self.dataset["test"][idx] model_ctx = ref["model_ctx"] @@ -241,8 +232,10 @@ def process_results(self, generations, references): list of str containing refrences :return: dict[str: float] """ - shadermatch = evaluate.load("Vipitis/shadermatch") + # shadermatch = evaluate.load("Vipitis/shadermatch") generations = [ generation[0] for generation in generations ] # unpack one list for some reason? (we zero shot) - return shadermatch.compute(predictions=generations, references=references) + results = self._metric.compute(predictions=generations, references=references) + # this also includes a list of all individual labels (in order). + return results \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index a944928f4..1ffea3637 100644 --- a/docs/README.md +++ b/docs/README.md @@ -427,9 +427,9 @@ accelerate launch main.py \ ``` ### Shadereval -[Shadereval](tbd.) explores "creative" code generation. Fragment shaders are sourced from Shadertoy.com and curated into the [Shadertoys](https://huggingface.co/datasets/Vipitis/Shadertoys) dataset. The task specific datasets are build from the Shadertoys dataset and therefore share a common train/test split. +[Shadereval](tbd.) explores "creative" code generation. Fragment GLSL shaders are sourced from Shadertoy.com and curated into the [Shadertoys](https://github.com/Vipitis/shadertoys-dataset) dataset. The task specific datasets are build from the Shadertoys dataset. -Task-1: **ReturnCompletion** provides a function header and body, so the model generates a matching return statement. Generations are evaluated by `exact-match` therefore does not require code execution. 
The original publication uses greedy decoding and only 300 samples. +Task-1: **ReturnCompletion** provides a function header and body, so the model generates a matching return statement. Generations are evaluated by `exact-match` and therefore do not require code execution. The original publication uses greedy decoding and only 300 samples. Dataset is now private due to a takedown notice, please contact for access. This is only meant as a prototype task. ```bash accelerate launch main.py \ @@ -439,18 +439,20 @@ accelerate launch main.py \ --do_sample False \ ``` -Task-2: **FunctionGeneration** parses comments directly before or after the function header as model input. The model is expected to generate a complete function that is syntactially sound. Generated functions are inserted in the original shader program for evaluation. A custom metric is hosted in the [demo space](https://huggingface.co/spaces/Vipitis/shadermatch) which render frames to compare. This requires an additional dependency [wgpu-shadertoy](https://github.com/pygfx/shadertoy). It's recommended to generate generations first and then evaluate them later. -The reference uses greedy decoding and fp16 for the first 300 examples. +Task-2: **FunctionGeneration** parses comments directly before the function header and the header itself as model input. The model is expected to generate a complete function that is syntactically sound. Generated functions are inserted in the original shader program for evaluation. A custom metric is hosted in the [demo space](https://huggingface.co/spaces/Vipitis/shadermatch) which renders frames to compare. This requires an additional dependency [wgpu-shadertoy](https://github.com/pygfx/shadertoy) as well as [tree-sitter-glsl](https://github.com/tree-sitter-grammars/tree-sitter-glsl). It's recommended to generate generations first and then evaluate them later. +The original publication greedily decodes 512 tokens at BF16.
However, custom sampling strategies and additional prompts can be attempted, as long as they are accurately communicated alongside the claimed results. ```bash accelerate launch main.py \ --model \ --tasks shadereval-2 \ --generation_only \ + --save_generations \ --save_generations_path "saved_generations.json" \ --allow_code_execution \ - --limit 300 \ - --do_sample False \ --precision fp16 \ + --do_sample False \ + --precision bf16 \ + --max_length_generation 512 ``` To evaluate later run the following command: @@ -461,9 +463,9 @@ accelerate launch main.py \ --tasks shadereval-2 \ --load_generations_path "saved_generations.json" \ --allow_code_execution \ - --limit 300 \ --metric_output_path "eval_results.json" \ - --precision fp16 + --precision bf16 \ + --max_length_generation 512 ``` ## Code generation benchmarks without unit tests From b08fdc1377dd86366cfc7113edf8762b19e7f950 Mon Sep 17 00:00:00 2001 From: Jan Date: Wed, 13 Nov 2024 01:51:46 +0100 Subject: [PATCH 10/10] update dataset link --- bigcode_eval/tasks/shadereval.py | 86 ++++++-------------------------- 1 file changed, 15 insertions(+), 71 deletions(-) diff --git a/bigcode_eval/tasks/shadereval.py b/bigcode_eval/tasks/shadereval.py index e9177e732..a3ad8c2b5 100644 --- a/bigcode_eval/tasks/shadereval.py +++ b/bigcode_eval/tasks/shadereval.py @@ -8,7 +8,7 @@ Task1 is a proof of concept and looks at code completion for returnstatemetns of Shadertoy functions. Exact_match and greedy decoding. Homepage: https://huggingface.co/spaces/Vipitis/ShaderEval -Paper-Title: Evaluating language models for computer graphics code completion +Paper-Title: Evaluating Language Models for Computer Graphics Code Completion TODO: Paper-URL: unavailable (unpublished) Description: Function Completion task for GLSL shadercode. Metric statically compares and then runs generated code to compare rendered frames with the refernece.
Homepage: https://huggingface.co/spaces/Vipitis/Shadermatch @@ -28,7 +28,7 @@ def create_all_tasks(): """ return { "shadereval-1": ReturnCompletion, - "shadereval-2": FunctionGeneration, + "shadereval-2": FunctionCompletion, } # TODO: Replace `NewTask` with the name of your Task. @@ -112,35 +112,26 @@ def process_results(self, generations, references): -# TODO: Replace `NewTask` with the name of your Task. -class FunctionGeneration(Task): #task2 - DATASET_PATH = "Vipitis/Shadereval-experiments-dev" #as a temporary solution to reduce current problems - - # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. - DATASET_NAME = None #this will eventually be a subset for the Shadertoys dataset, but not right now +class FunctionCompletion(Task): #task2 + DATASET_PATH = "Vipitis/Shadereval-inputs" + DATASET_NAME = None + # revision hash: 274eb4d3017d59da2a1f48bc7194be1545de919f (or v0.4 tag - TBD) def __init__(self): super().__init__( - # TODO: Specify the list of stop words in `stop_words` for the code generation task \ - # and if the evaluation requires executing the generated code in `requires_execution`. - # stop_words=["\nfloat ", "\nvec", "\nint", "\nvoid", "\nmat"], #new function starts... so all the keywords - # TODO: stopwords can cause incorrect early stopping, so we don't edn up using them. I am considering using guided generation with tree-sitter to do early stopping. - stop_words=[], #set it's to Falsy? - requires_execution=True, #we run shadercode - could that be harmful? (all in the metric) + stop_words=[], #early stopping via stop words has impacted generations meaningfully so it's not done! + requires_execution=True, #we run shadercode which can be unsafe! 
) self._metric = evaluate.load("Vipitis/shadermatch") #load the metric from the evaluate library def get_dataset(self): - # TODO replace with subset once that is set up return self.dataset["test"] def fewshot_examples(self): - # TODO: load few-shot examples (from lm_eval/tasks/fewshot_examples) if they exist """Loads and returns the few-shot examples for the task if they exist.""" pass def get_prompt(self, doc): - # TODO: build the prompt for the language model from a sample `doc` from the dataset """ :param doc: dict[str: str] sample from the test dataset @@ -149,7 +140,6 @@ def get_prompt(self, doc): return doc["model_inp"] def get_reference(self, doc): - # TODO: get the reference solution from a sample `doc` from the dataset """ Builds the reference solution for the doc (sample from the test dataset). :param doc: dict[str: str] @@ -158,37 +148,7 @@ def get_reference(self, doc): """ return doc["image_code"] #returns full original code - def remove_last_block(self, code): - """ - Adapted from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/be2a44c2faa29c20b5041d7083acb698eb373309/bigcode_eval/tasks/humanevalpack.py#L275C5-L311C20 - """ - # TODO: can be removed - for w in self.stop_words: - if w in code: - code = code[:code.find(w)] - - ### Find the first occassion where a chain of { } is closed?? - open_brackets = 1 - cut = False - for i, c in enumerate(code.encode("utf-8")): - c = chr(c) - if c == '{': - open_brackets += 1 - elif c == '}': - open_brackets -= 1 - if open_brackets == 0: - code = code.encode("utf-8")[:i+1].decode("utf-8", "ignore") - cut = True - break - if not cut: - if '}' in code: - code = code[:code.rfind('}')] + '}' - else: - code = code + "// incomplete generation! \n" - return code - def postprocess_generation(self, generation, idx): - # TODO: define the postprocessing for the LM generation """ Defines the postprocessing for a LM generation. 
:param generation: str @@ -197,31 +157,15 @@ def postprocess_generation(self, generation, idx): index of doc in the dataset to which the generation belongs :return: str """ - + # these postprocessing steps are implemented in the metric itself: https://huggingface.co/spaces/Vipitis/shadermatch/blob/main/shadermatch.py#L139-L168 + # and rely on additional dependencies: tree-sitter-glsl, [maybe also wgpu-py, glfw] row = self.dataset["test"][idx] truncated = self._metric.truncate_generation(model_inp="", generation=generation) - # TODO: the metric methods will be renaming their args to be more broadly useable.. maybe even refactor the bit at the top. + # TODO: the metric methods will be renaming their args to be more broadly useable. altered = self._metric.replace_body(ref_code=row["image_code"], altered_body=truncated, end_header_byte=row["func_bytes"][0], end_function_byte=row["func_bytes"][4]) return altered - # TODO: remove the old code - # assemble into the full code with just the function replaced - ref = self.dataset["test"][idx] - model_ctx = ref["model_ctx"] - full_code = ref["full_code"] - start, end = ref["func_range"] - before_gen = full_code.encode("utf-8")[:start].decode("utf-8") - after_gen = full_code.encode("utf-8")[end:].decode("utf-8") - - if self.prompt == "full": - gen = self.remove_last_block(generation.encode("utf-8")[start + len(model_ctx.encode("utf-8")):].decode("utf-8")) - else: - gen = self.remove_last_block(generation.encode("utf-8")[len(model_ctx.encode("utf-8")):].decode("utf-8")) #remove last block to avoid syntax errors - return before_gen + model_ctx + gen + after_gen #does this patch it together correctly? - def process_results(self, generations, references): - # TODO: define how the evaluation score is computed from list of \ - # generations and reference solutions """ Takes the list of LM generations and evaluates them against ground truth references, returning the metric for the generations as in {"metric_name": result}.
@@ -232,10 +176,10 @@ def process_results(self, generations, references): list of str containing refrences :return: dict[str: float] """ - # shadermatch = evaluate.load("Vipitis/shadermatch") - generations = [ - generation[0] for generation in generations - ] # unpack one list for some reason? (we zero shot) + # one candidate per generation, has to be unpacked here. + generations = [generation[0] for generation in generations] + # the metric is implemented as an evaluate.metric here: https://huggingface.co/spaces/Vipitis/shadermatch/blob/main/shadermatch.py + # this definitely requires wgpu-py, glfw, wgpu-shadertoy, tree-sitter-glsl, numpy, Pillow and tqdm results = self._metric.compute(predictions=generations, references=references) # this also includes a list of all individual labels (in order). return results \ No newline at end of file