From 4d8c521fb8477ceda45f2adad10979624ee0ca6c Mon Sep 17 00:00:00 2001 From: BeachWang <1400012807@pku.edu.cn> Date: Fri, 2 Aug 2024 17:46:37 +0800 Subject: [PATCH] sandbox bench experiment workflow (#364) * FVD and ISV for video eval * restore tools init * restore tools init * pre-commit done * add FID KID IS PR and PRV metrics * add KVD metric * fix doc * allow relative path * fix sample 50000 image * fvd sandbox * fvd sandbox test done * precommit done * easyanimate train and infer in sandbox * divide dataset pipline * fix data num for each partition * pre-commit done * test sandbox for videos done * fix executor * fix executor * check datalen * sort data for partition * sort data for partition * fix video_aspect_ratio_filter * fix video_aspect_ratio_filter * tensor stats to float * precommit done * fix words num filter * pre-commit done * add seed for train and infer * add seed for easyanimate * sandbox rebuild v1 * fix empty frames * switch * fix conflict * fix hpo 3sigma * after pre-commit * sandbox readme zh * finish doc * remove training limit * other_configs -> extra_configs * other_configs -> extra_configs * res_name -> meta_name * hooker -> hook * analyze -> analyse * after pre-commit * analyse -> analyze * analyser.py -> analyzer.py * analyser.py -> analyzer.py * analyser.py -> analyzer.py * regist -> register, DICT -> MAPPING * range_specified_field_selector * pipline test done * dataset in readme * update readme * pre-commit done * rm experiment name in dj * add init dataset * fix auto_evaluation_helm readme * remove easyanimate code * shorten diff --------- Co-authored-by: binke --- .gitignore | 1 - .pre-commit-config.yaml | 3 +- README.md | 22 +- README_ZH.md | 2 +- configs/config_all.yaml | 20 +- configs/data_juicer_recipes/README.md | 1 + configs/data_juicer_recipes/README_ZH.md | 3 +- configs/demo/bench/1_single_op_pipline.yaml | 68 + configs/demo/bench/2_multi_op_pipline.yaml | 58 + configs/demo/bench/3_duplicate_pipline.yaml | 30 + configs/demo/bench/model_infer.yaml | 25 + configs/demo/bench/model_train.yaml | 31 + configs/demo/bench/model_train_2_epoch.yaml | 31 + .../vbench_eval.yaml} | 12 +- configs/demo/sandbox/sandbox.yaml | 1 + data_juicer/core/sandbox/evaluators.py | 24 +- data_juicer/core/sandbox/factories.py | 60 +- data_juicer/core/sandbox/hooks.py | 56 +- data_juicer/core/sandbox/model_executors.py | 99 +- .../ops/filter/video_aesthetics_filter.py | 34 +- .../ops/filter/video_duration_filter.py | 6 +- .../video_frames_text_similarity_filter.py | 35 +- data_juicer/ops/filter/video_nsfw_filter.py | 33 +- .../ops/filter/video_watermark_filter.py | 30 +- data_juicer/ops/selector/__init__.py | 10 +- data_juicer/ops/selector/random_selector.py | 49 + .../range_specified_field_selector.py | 109 + .../selector/topk_specified_field_selector.py | 16 +- data_juicer/utils/common_utils.py | 21 + demos/auto_evaluation_helm/README_ZH.md | 6 +- demos/data/demo-dataset-videos.jsonl | 6 + docs/Operators.md | 4 +- docs/Operators_ZH.md | 4 +- docs/Sandbox-ZH.md | 48 +- docs/Sandbox.md | 58 +- environments/sandbox_requires.txt | 17 +- environments/science_requires.txt | 2 +- scripts/README.md | 2 +- tests/ops/selector/test_random_selector.py | 274 +++ .../test_range_specified_field_selector.py | 641 ++++++ thirdparty/{ => LLM_ecosystems}/README.md | 2 +- thirdparty/{ => LLM_ecosystems}/README_ZH.md | 2 +- .../{ => LLM_ecosystems}/patch/helm.diff | 0 .../{ => LLM_ecosystems}/patch/megatron.diff | 0 thirdparty/{ => LLM_ecosystems}/setup_helm.sh | 0 .../{ => 
LLM_ecosystems}/setup_megatron.sh | 0 thirdparty/models/README.md | 9 + thirdparty/models/README_ZH.md | 9 + thirdparty/models/patch/easyanimate.diff | 1852 +++++++++++++++++ thirdparty/models/setup_easyanimate.sh | 11 + tools/distributed_deduplication/README.md | 2 +- tools/distributed_deduplication/README_ZH.md | 2 +- tools/hpo/README.md | 16 +- tools/hpo/README_ZH.md | 14 +- tools/mm_eval/{ => vbench_metrics}/README.md | 0 .../mm_eval/{ => vbench_metrics}/README_ZH.md | 0 .../vbench_metrics/VBench_mini_info.json | 109 + tools/multimodal/README.md | 68 +- tools/multimodal/README_ZH.md | 6 +- tools/sandbox_starter.py | 31 +- 60 files changed, 3817 insertions(+), 268 deletions(-) create mode 100644 configs/demo/bench/1_single_op_pipline.yaml create mode 100644 configs/demo/bench/2_multi_op_pipline.yaml create mode 100644 configs/demo/bench/3_duplicate_pipline.yaml create mode 100644 configs/demo/bench/model_infer.yaml create mode 100644 configs/demo/bench/model_train.yaml create mode 100644 configs/demo/bench/model_train_2_epoch.yaml rename configs/demo/{sandbox/vbench_eval_config.yaml => bench/vbench_eval.yaml} (63%) create mode 100644 data_juicer/ops/selector/random_selector.py create mode 100644 data_juicer/ops/selector/range_specified_field_selector.py create mode 100644 data_juicer/utils/common_utils.py create mode 100644 demos/data/demo-dataset-videos.jsonl create mode 100644 tests/ops/selector/test_random_selector.py create mode 100644 tests/ops/selector/test_range_specified_field_selector.py rename thirdparty/{ => LLM_ecosystems}/README.md (97%) rename thirdparty/{ => LLM_ecosystems}/README_ZH.md (96%) rename thirdparty/{ => LLM_ecosystems}/patch/helm.diff (100%) rename thirdparty/{ => LLM_ecosystems}/patch/megatron.diff (100%) rename thirdparty/{ => LLM_ecosystems}/setup_helm.sh (100%) rename thirdparty/{ => LLM_ecosystems}/setup_megatron.sh (100%) create mode 100644 thirdparty/models/README.md create mode 100644 thirdparty/models/README_ZH.md create mode 100644 thirdparty/models/patch/easyanimate.diff create mode 100755 thirdparty/models/setup_easyanimate.sh rename tools/mm_eval/{ => vbench_metrics}/README.md (100%) rename tools/mm_eval/{ => vbench_metrics}/README_ZH.md (100%) create mode 100644 tools/mm_eval/vbench_metrics/VBench_mini_info.json diff --git a/.gitignore b/.gitignore index d5d1d0782..15c65f412 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ # data & resources -models/ outputs/ assets/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index df324124c..f8b87da36 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,6 +39,7 @@ exclude: | docs/.*| tests/.*| demos/.*| - tools/mm_eval/inception_metrics.*| + tools/mm_eval/inception_metrics/.*| + thirdparty/easy_animate/.*| .*\.md )$ diff --git a/README.md b/README.md index 4ee07206f..e56f28bb7 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ In this new version, we support more features for **multimodal data (including v - [2024-02-20] We have actively maintained an *awesome list of LLM-Data*, welcome to [visit](docs/awesome_llm_data.md) and contribute! - [2024-02-05] Our paper has been accepted by SIGMOD'24 industrial track! - [2024-01-10] Discover new horizons in "Data Mixture"—Our second data-centric LLM competition has kicked off! Please visit the competition's [official website](https://tianchi.aliyun.com/competition/entrance/532174) for more information. -- [2024-01-05] We release **Data-Juicer v0.1.3** now! +- [2024-01-05] We release **Data-Juicer v0.1.3** now! 
In this new version, we support **more Python versions** (3.8-3.10), and support **multimodal** dataset [converting](tools/multimodal/README.md)/[processing](docs/Operators.md) (Including texts, images, and audios. More modalities will be supported in the future). Besides, our paper is also updated to [v3](https://arxiv.org/abs/2309.02033). - [2023-10-13] Our first data-centric LLM competition begins! Please @@ -94,8 +94,8 @@ Table of Contents dedicated [toolkits](#documentation), designed to function independently of specific multimodal LLM datasets and processing pipelines. -- **Data-in-the-loop & Sandbox**: Supporting one-stop data-model collaborative development, enabling rapid iteration - through the [sandbox laboratory](docs/Sandbox.md), and providing features such as feedback loops based on data and model, +- **Data-in-the-loop & Sandbox**: Supporting one-stop data-model collaborative development, enabling rapid iteration + through the [sandbox laboratory](docs/Sandbox.md), and providing features such as feedback loops based on data and model, visualization, and multidimensional automatic evaluation, so that you can better understand and improve your data and models. ![Data-in-the-loop](https://img.alicdn.com/imgextra/i2/O1CN017U7Zz31Y7XtCJ5GOz_!!6000000003012-0-tps-3640-1567.jpg) @@ -194,11 +194,11 @@ The dependency options are listed below: pip install py-data-juicer ``` -- **Note**: +- **Note**: - only the basic APIs in `data_juicer` and two basic tools (data [processing](#data-processing) and [analysis](#data-analysis)) are available in this way. If you want customizable and complete functions, we recommend you install `data_juicer` [from source](#from-source). - - The release versions from pypi have a certain lag compared to the latest version from source. + - The release versions from pypi have a certain lag compared to the latest version from source. So if you want to follow the latest functions of `data_juicer`, we recommend you install [from source](#from-source). ### Using Docker @@ -215,7 +215,7 @@ pip install py-data-juicer ```shell docker build -t datajuicer/data-juicer: . ``` - + - The format of `` is like `v0.2.0`, which is the same as release version tag. ### Installation check @@ -413,20 +413,20 @@ docker exec -it bash Data-Juicer is released under Apache License 2.0. ## Contributing -We are in a rapidly developing field and greatly welcome contributions of new -features, bug fixes and better documentations. Please refer to +We are in a rapidly developing field and greatly welcome contributions of new +features, bug fixes and better documentations. Please refer to [How-to Guide for Developers](docs/DeveloperGuide.md). If you have any questions, please join our [discussion groups](README.md). ## Acknowledgement Data-Juicer is used across various LLM products and research initiatives, -including industrial LLMs from Alibaba Cloud's Tongyi, such as Dianjin for -financial analysis, and Zhiwen for reading assistant, as well as the Alibaba +including industrial LLMs from Alibaba Cloud's Tongyi, such as Dianjin for +financial analysis, and Zhiwen for reading assistant, as well as the Alibaba Cloud's platform for AI (PAI). We look forward to more of your experience, suggestions and discussions for collaboration! 
-Data-Juicer thanks and refers to several community projects, such as +Data-Juicer thanks and refers to several community projects, such as [Huggingface-Datasets](https://github.com/huggingface/datasets), [Bloom](https://huggingface.co/bigscience/bloom), [RedPajama](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1), [Pile](https://huggingface.co/datasets/EleutherAI/pile), [Alpaca-Cot](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT), [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), [DeepSpeed](https://www.deepspeed.ai/), [Arrow](https://github.com/apache/arrow), [Ray](https://github.com/ray-project/ray), [Beam](https://github.com/apache/beam), [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness), [HELM](https://github.com/stanford-crfm/helm), .... diff --git a/README_ZH.md b/README_ZH.md index 518182d2a..473b560e8 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -193,7 +193,7 @@ pip install py-data-juicer ```shell docker build -t datajuicer/data-juicer: . ``` - + - ``的格式类似于`v0.2.0`,与发布(Release)的版本号相同。 ### 安装校验 diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 7a66292ce..8273a30f4 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -49,7 +49,6 @@ data_probe_algo: 'uniform' # sampling algorithm data_probe_ratio: 1.0 # the sampling ratio to the original dataset size. It's 1.0 in default. Only used for dataset sampling. hpo_config: null # path to a configuration file when using auto-HPO tool. - # process schedule: a list of several process operators with their arguments process: # Mapper ops. Most of these ops need no arguments. @@ -496,13 +495,22 @@ process: ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations # Selector ops - - topk_specified_field_selector: # selector to select top samples based on the sorted specified field - field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.' - top_ratio: # ratio of selected top samples - topk: # number of selected top sample - reverse: True # determine the sorting rule, if reverse=True, then sort in descending order - frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.' top_ratio: # ratio of selected top specified field value topk: # number of selected top specified field value reverse: True # determine the sorting rule, if reverse=True, then sort in descending order + - random_selector: # selector to random select samples + select_ratio: # the ratio to be sampled + select_num: # the number to be sampled + - range_specified_field_selector: # selector to select a range of samples based on the sorted specified field value from smallest to largest. + field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.' + lower_percentile: # the lower bound of the percentile to be sampled + upper_percentile: # the upper bound of the percentile to be sampled + lower_rank: # the lower rank of the percentile to be sampled + upper_rank: # the upper rank of the percentile to be sampled + - topk_specified_field_selector: # selector to select top samples based on the sorted specified field + field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.' 
+ top_ratio: # ratio of selected top samples + topk: # number of selected top sample + reverse: True # determine the sorting rule, if reverse=True, then sort in descending order diff --git a/configs/data_juicer_recipes/README.md b/configs/data_juicer_recipes/README.md index 57073fb2a..ee5068047 100644 --- a/configs/data_juicer_recipes/README.md +++ b/configs/data_juicer_recipes/README.md @@ -41,6 +41,7 @@ We use simple 3-σ rule to set the hyperparameters for ops in each recipe. | subset | #samples before | #samples after | keep ratio | config link | data link | source | |---------------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| | LLaVA pretrain (LCS-558k) | 558,128 | 500,380 | 89.65% | [llava-pretrain-refine.yaml](llava-pretrain-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/LLaVA-1.5/public/llava-pretrain-refine-result.json)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/llava-pretrain-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/llava-pretrain-refined-by-data-juicer) | [LLaVA-1.5](https://github.com/haotian-liu/LLaVA) | +| Data-Juicer-T2V | 1,217,346 | 147,176 | 12.09% | [2_multi_op_pipline.yaml](../demo/bench/2_multi_op_pipline.yaml) | [Aliyun](http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/Data-Juicer-T2V/data_juicer_t2v_optimal_data_pool.zip)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/data-juicer-t2v-optimal-data-pool)
[HuggingFace](https://huggingface.co/datasets/datajuicer/data-juicer-t2v-optimal-data-pool) | [InternVid (606k)](https://github.com/OpenGVLab/InternVideo/tree/main/Data/InternVid)
[Panda-70M (605k)](https://github.com/snap-research/Panda-70M)
[MSR-VTT (6k)](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) | ### Evaluation Results - LLaVA pretrain (LCS-558k): models **pretrained with refined dataset** and fine-tuned with the original instruct dataset outperforms the baseline (LLaVA-1.5-13B) on 10 out of 12 benchmarks. diff --git a/configs/data_juicer_recipes/README_ZH.md b/configs/data_juicer_recipes/README_ZH.md index 855e433d6..84a195863 100644 --- a/configs/data_juicer_recipes/README_ZH.md +++ b/configs/data_juicer_recipes/README_ZH.md @@ -41,6 +41,7 @@ | 数据子集 | 完善前的样本数目 | 完善后的样本数目 | 样本保留率 | 配置链接 | 数据链接 | 来源 | |---------------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| | LLaVA pretrain (LCS-558k) | 558,128 | 500,380 | 89.65% | [llava-pretrain-refine.yaml](llava-pretrain-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/LLaVA-1.5/public/llava-pretrain-refine-result.json)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/llava-pretrain-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/llava-pretrain-refined-by-data-juicer) | [LLaVA-1.5](https://github.com/haotian-liu/LLaVA) | +| Data-Juicer-T2V | 1,217,346 | 147,176 | 12.09% | [2_multi_op_pipline.yaml](../demo/bench/2_multi_op_pipline.yaml) | [Aliyun](http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/Data-Juicer-T2V/data_juicer_t2v_optimal_data_pool.zip)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/data-juicer-t2v-optimal-data-pool)
[HuggingFace](https://huggingface.co/datasets/datajuicer/data-juicer-t2v-optimal-data-pool) | [InternVid (606k)](https://github.com/OpenGVLab/InternVideo/tree/main/Data/InternVid)
[Panda-70M (605k)](https://github.com/snap-research/Panda-70M)
[MSR-VTT (6k)](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) | ### 评测结果 - LLaVA pretrain (LCS-558k): 使用**完善后的预训练数据集**预训练并使用原始的指令数据集微调后的模型在12个评测集上有10个超过了基线模型LLaVA-1.5-13B。 @@ -57,4 +58,4 @@ - 仅视频:根据视频性质提高数据集质量 - 文本-视频:根据文本和视频间的对齐提高数据集质量 用户可以基于这个菜谱开始他们的视频数据集处理流程。 -- \ No newline at end of file +- diff --git a/configs/demo/bench/1_single_op_pipline.yaml b/configs/demo/bench/1_single_op_pipline.yaml new file mode 100644 index 000000000..723a2204d --- /dev/null +++ b/configs/demo/bench/1_single_op_pipline.yaml @@ -0,0 +1,68 @@ +# Sandbox config example + +# global parameters +project_name: 'demo-bench' +experiment_name: 'single_op_language_score' # for wandb tracer name +work_dir: './outputs/demo-bench' # the default output dir for meta logging + +# configs for each job, the jobs will be executed according to the order in the list +probe_job_configs: + # get statistics value for each sample and get the distribution analysis for given percentiles + - hook: 'ProbeViaAnalyzerHook' + meta_name: 'analysis_ori_data' + dj_configs: + project_name: 'demo-bench' + dataset_path: './demos/data/demo-dataset-videos.jsonl' # path to your dataset directory or file + percentiles: [0.333, 0.667] # percentiles to analyze the dataset distribution + export_path: './outputs/demo-bench/demo-dataset-with-language-score.jsonl' + export_original_dataset: true # must be true to keep statistics values with dataset + process: + - language_id_score_filter: + lang: 'zh' + min_score: 0.8 + extra_configs: + +refine_recipe_job_configs: + +execution_job_configs: + # sample the splits with low/middle/high statistics values + - hook: 'ProcessDataHook' + meta_name: + dj_configs: + project_name: 'demo-bench' + dataset_path: './outputs/demo-bench/demo-dataset-with-language-score.jsonl' # output dataset of probe jobs + export_path: './outputs/demo-bench/demo-dataset-with-high-language-score.jsonl' + process: + - range_specified_field_selector: + field_key: '__dj__stats__.lang_score' # '__dj__stats__' the target keys corresponding to multi-level field information need to be separated by '.'. 'dj__stats' is the default location for storing stats in Data Juicer, and 'lang_score' is the stats corresponding to the language_id_score_filter. 
+ lower_percentile: 0.667 + upper_percentile: 1.000 + extra_configs: + # random sample dataset with fix number of instances + - hook: 'ProcessDataHook' + meta_name: + dj_configs: + project_name: 'demo-bench' + dataset_path: './outputs/demo-bench/demo-dataset-with-high-language-score.jsonl' # output dataset of probe jobs + export_path: './outputs/demo-bench/demo-dataset-for-train.jsonl' + process: + - random_selector: + select_num: 16 + extra_configs: + # train model + - hook: 'TrainModelHook' + meta_name: + dj_configs: + extra_configs: './configs/demo/bench/model_train.yaml' + # infer model + - hook: 'InferModelHook' + meta_name: + dj_configs: + extra_configs: './configs/demo/bench/model_infer.yaml' + +evaluation_job_configs: + # vbench evaluation + - hook: 'EvaluateDataHook' + meta_name: 'vbench_eval' + dj_configs: + extra_configs: './configs/demo/bench/vbench_eval.yaml' diff --git a/configs/demo/bench/2_multi_op_pipline.yaml b/configs/demo/bench/2_multi_op_pipline.yaml new file mode 100644 index 000000000..d4961e91a --- /dev/null +++ b/configs/demo/bench/2_multi_op_pipline.yaml @@ -0,0 +1,58 @@ +# Sandbox config example + +# global parameters +project_name: 'demo-bench' +experiment_name: 'single_op_language_score' # for wandb tracer name +work_dir: './outputs/demo-bench' # the default output dir for meta logging + +# configs for each job, the jobs will be executed according to the order in the list +probe_job_configs: + +refine_recipe_job_configs: + +execution_job_configs: + - hook: 'ProcessDataHook' + meta_name: + dj_configs: + project_name: 'demo-bench' + dataset_path: './demos/data/demo-dataset-videos.jsonl' # path to your dataset directory or file + export_path: './outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl' + export_original_dataset: true # must be true to keep statistics values with dataset + process: + # select samples with high language score + - language_id_score_filter: + lang: + min_score: 0.7206037306785583 # this value can be observed in the analysis result of the probe job in one op experiments + # select samples with middle video duration + - video_duration_filter: + min_duration: 19.315000 # this value can be observed in the analysis result of the probe job in one op experiments + max_duration: 32.045000 # this value can be observed in the analysis result of the probe job in one op experiments + + extra_configs: + - hook: 'ProcessDataHook' + meta_name: + dj_configs: + project_name: 'demo-bench' + dataset_path: './outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl' + export_path: './outputs/demo-bench/demo-dataset-for-train.jsonl' + process: + - random_selector: + select_num: 16 + extra_configs: + # train model + - hook: 'TrainModelHook' + meta_name: + dj_configs: + extra_configs: './configs/demo/bench/model_train.yaml' + # infer model + - hook: 'InferModelHook' + meta_name: + dj_configs: + extra_configs: './configs/demo/bench/model_infer.yaml' + +evaluation_job_configs: + # vbench evaluation + - hook: 'EvaluateDataHook' + meta_name: 'vbench_eval' + dj_configs: + extra_configs: './configs/demo/bench/vbench_eval.yaml' diff --git a/configs/demo/bench/3_duplicate_pipline.yaml b/configs/demo/bench/3_duplicate_pipline.yaml new file mode 100644 index 000000000..caf1bb998 --- /dev/null +++ b/configs/demo/bench/3_duplicate_pipline.yaml @@ -0,0 +1,30 @@ +# Sandbox config example + +# global parameters +project_name: 'demo-bench' +experiment_name: 'single_op_language_score' # for wandb tracer name +work_dir: './outputs/demo-bench' # the default output dir for 
meta logging + +# configs for each job, the jobs will be executed according to the order in the list +probe_job_configs: + +refine_recipe_job_configs: + +execution_job_configs: + # train model + - hook: 'TrainModelHook' + meta_name: + dj_configs: + extra_configs: './configs/demo/bench/model_train_2_epoch.yaml' # the input data is set to be demo-dataset-with-multi-op-stats.jsonl + # infer model + - hook: 'InferModelHook' + meta_name: + dj_configs: + extra_configs: './configs/demo/bench/model_infer.yaml' + +evaluation_job_configs: + # vbench evaluation + - hook: 'EvaluateDataHook' + meta_name: 'vbench_eval' + dj_configs: + extra_configs: './configs/demo/bench/vbench_eval.yaml' diff --git a/configs/demo/bench/model_infer.yaml b/configs/demo/bench/model_infer.yaml new file mode 100644 index 000000000..f105f1e42 --- /dev/null +++ b/configs/demo/bench/model_infer.yaml @@ -0,0 +1,25 @@ +type: easyanimate +model_name: "easyanimate" +infer_name: "easyanimate-lora-generate" +train: + model_path: + # path to the pixart model or the hugging face model + pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512" + # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints. + # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors + transformer_path: "/PATH/TO/EASYANIMATE_MODEL" + # Note that the root path is in "thirdparty/easy_animate" + lora_path: "../../../outputs/demo-bench/models/checkpoint-2.safetensors" + + infer_config: + # must match the pretrained easyanimate checkpoint. + image_size: 256 + prompt_info_path: "../../../tools/mm_eval/vbench_metrics/VBench_mini_info.json" # Use VBench_full_info.json for full eval. + gpu_num: 1 + batch_size: 8 + mixed_precision: "bf16" + video_num_per_prompt: 5 + seed: 43 + + saving_config: + output_video_dir: "../../../outputs/demo-bench/generated_videos" diff --git a/configs/demo/bench/model_train.yaml b/configs/demo/bench/model_train.yaml new file mode 100644 index 000000000..5cc17bd9b --- /dev/null +++ b/configs/demo/bench/model_train.yaml @@ -0,0 +1,31 @@ +type: easyanimate +model_name: "easyanimate" +trainer_name: "easyanimate-lora-trainer" +train: + tracker_config: + # config for wandb + project_name: "demo-bench" + experiment_name: 'demo-single-op-model-train' + model_path: + # path to the pixart model or the hugging face model + pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512" + # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints. + # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors + transformer_path: "/PATH/TO/EASYANIMATE_MODEL" + dataset_path: + # The root directory of the videos. Set it empty if the paths in the dataset are absolute. + dataset_name: "" + # path to the Data-Juicer dataset. Note that the root path is in "thirdparty/models/EasyAnimate" + dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl" + training_config: + # image size, must match the pretrained easyanimate checkpoint.
+ sample_size: 256 + mixed_precision: "bf16" + batch_size_per_gpu: 8 + gradient_accumulation_steps: 1 + num_train_epochs: 1 + dataloader_num_workers: 8 + seed: 42 + saving_config: + # Note that the root path is in "thirdparty/models/EasyAnimate" + output_dir: "../../../outputs/demo-bench/models" diff --git a/configs/demo/bench/model_train_2_epoch.yaml b/configs/demo/bench/model_train_2_epoch.yaml new file mode 100644 index 000000000..d91a65204 --- /dev/null +++ b/configs/demo/bench/model_train_2_epoch.yaml @@ -0,0 +1,31 @@ +type: easyanimate +model_name: "easyanimate" +trainer_name: "easyanimate-lora-trainer" +train: + tracker_config: + # config for wandb + project_name: "demo-bench" + experiment_name: 'demo-single-op-model-train' + model_path: + # path to the pixart model or the hugging face model + pretrained_model_name_or_path: "PixArt-alpha/PixArt-XL-2-512x512" + # path to pretrained easyanimate checkpoint. Following are the links to available checkpoints. + # https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors + transformer_path: "/PATH/TO/EASYANIMATE_MODEL" + dataset_path: + # The root directory of the videos. Set it empty if the paths in the dataset are absolute. + dataset_name: "" + # path to the Data-Juicer dataset. Note that the root path is in "thirdparty/easy_animate" + dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl" + training_config: + # image size, must match the pretrained easyanimate checkpoint. + sample_size: 256 + mixed_precision: "bf16" + batch_size_per_gpu: 8 + gradient_accumulation_steps: 1 + num_train_epochs: 2 + dataloader_num_workers: 8 + seed: 42 + saving_config: + # Note that the root path is in "thirdparty/easy_animate" + output_dir: "../../../outputs/demo-bench/models" diff --git a/configs/demo/sandbox/vbench_eval_config.yaml b/configs/demo/bench/vbench_eval.yaml similarity index 63% rename from configs/demo/sandbox/vbench_eval_config.yaml rename to configs/demo/bench/vbench_eval.yaml index 1e4989ca7..a2f067868 100644 --- a/configs/demo/sandbox/vbench_eval_config.yaml +++ b/configs/demo/bench/vbench_eval.yaml @@ -1,16 +1,16 @@ type: vbench_video_evaluator -# The vbench prompts for video generation. -prompt_path: ./tools/mm_eval/vbench_metrics/VBench_full_info.json +# The vbench prompts for video generation. Use VBench_full_info.json for full eval. +prompt_path: ./tools/mm_eval/vbench_metrics/VBench_mini_info.json # The path to the dir of generated videos -videos_path: /path/to/the/generated/videos +videos_path: ./outputs/demo-bench/generated_videos # The dir to store the eval results -result_dir: ./outputs/demo-sandbox/vbench_eval_results +result_dir: ./outputs/demo-bench/eval_results # Give a name for this eval -eval_name: +eval_name: mini_test # If true, load the required model for VBench from the cache path of environment variable VBENCH_CACHE_DIR load_ckpt_from_local: false @@ -20,8 +20,6 @@ load_ckpt_from_local: false # 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality', 'object_class', # 'multiple_objects', 'human_action', 'color', 'spatial_relationship', 'scene', 'temporal_style', # 'appearance_style', 'overall_consistency'] -# NOTE: Current version of vbench in pypi lacks of a third party code for motion_smoothness. -# NOTE: Besides, when len(dimension_list) > 1, it would occur an error in video loading.
dimension_list: - subject_consistency - dynamic_degree diff --git a/configs/demo/sandbox/sandbox.yaml b/configs/demo/sandbox/sandbox.yaml index a250b4ed4..9654b9552 100644 --- a/configs/demo/sandbox/sandbox.yaml +++ b/configs/demo/sandbox/sandbox.yaml @@ -3,6 +3,7 @@ # global parameters project_name: 'demo-sandbox' experiment_name: 'demo-sandbox-run0' # for wandb tracer name +work_dir: './outputs/demo-sandbox' # the default output dir for meta logging hpo_config: null # path to a configuration file when using auto-HPO tool. # configs for each job, the jobs will be executed according to the order in the list diff --git a/data_juicer/core/sandbox/evaluators.py b/data_juicer/core/sandbox/evaluators.py index fdf735732..750017669 100644 --- a/data_juicer/core/sandbox/evaluators.py +++ b/data_juicer/core/sandbox/evaluators.py @@ -19,7 +19,7 @@ class BaseEvaluator(object): def __init__(self, eval_config: dict): self.eval_config = eval_config - def run(self, eval_type, eval_obj, **kwargs) -> dict: + def run(self, eval_type, eval_obj=None, **kwargs) -> dict: """ conduct the evaluation given specified measurement on specified target object; @@ -30,11 +30,9 @@ def run(self, eval_type, eval_obj, **kwargs) -> dict: class Gpt3QualityEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): if eval_type == 'data': - # eval_obj is the path to the dataset to be evaluated - assert isinstance(eval_obj, str) - input_data_path = eval_obj + input_data_path = self.eval_config.dataset_path tmp_res_export_path = input_data_path + '.tmp_res.jsonl' if os.path.exists(tmp_res_export_path): if os.path.isfile(tmp_res_export_path): @@ -58,7 +56,7 @@ def run(self, eval_type, eval_obj, **kwargs): class InceptionEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): if eval_type == 'data': result_dict = calc_metrics( fake_data_path=self.eval_config.fake_data_path, @@ -83,26 +81,26 @@ def run(self, eval_type, eval_obj, **kwargs): class HelmEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): raise NotImplementedError("To be refactored from dj's `thirdparty`.") class GptEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): raise NotImplementedError('To be refactored from `tools.evaluator`,') class VideoFvdEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): raise NotImplementedError( 'To be refactored from video fvd/isv related tools.') class Gpt4VEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): raise NotImplementedError( 'To be refactored from gpt4v related operators/tools.') @@ -113,7 +111,7 @@ def get_score(self, result_path, dimension): cur_result = json.load(open(result_path)) return cur_result[dimension][0] - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): if eval_type == 'data': prompt_path = self.eval_config.prompt_path videos_path = self.eval_config.videos_path @@ -153,13 +151,13 @@ def run(self, eval_type, eval_obj, **kwargs): class LmHarnessEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): raise NotImplementedError( 'To be refactored from, used in 
data-juicer competition.') class ModelscopeEvaluator(BaseEvaluator): - def run(self, eval_type, eval_obj, **kwargs): + def run(self, eval_type, eval_obj=None, **kwargs): raise NotImplementedError( 'To be implemented from https://github.com/modelscope/eval-scope.') diff --git a/data_juicer/core/sandbox/factories.py b/data_juicer/core/sandbox/factories.py index 1fda9f2b9..7e3b9d35f 100644 --- a/data_juicer/core/sandbox/factories.py +++ b/data_juicer/core/sandbox/factories.py @@ -1,8 +1,41 @@ +from data_juicer.core import Analyzer +from data_juicer.core import Executor as DjExecutor from data_juicer.core.sandbox.evaluators import (Gpt3QualityEvaluator, InceptionEvaluator, VBenchEvaluator) -from data_juicer.core.sandbox.model_executors import (ModelscopeInferExecutor, - ModelscopeTrainExecutor) +from data_juicer.core.sandbox.model_executors import ( + EasyAnimateInferExecutor, EasyAnimateTrainExecutor, + ModelscopeInferProbeExecutor, ModelscopeTrainExecutor) + + +class DataExecutorFactory(object): + """ + Factory for Data-Juicer executor. + """ + + def __call__(self, dj_cfg: dict = None, *args, **kwargs): + if dj_cfg is None: + return None + + return DjExecutor(dj_cfg) + + +data_executor_factory = DataExecutorFactory() + + +class DataAnalyzerFactory(object): + """ + Factory for Data-Juicer analyzer. + """ + + def __call__(self, dj_cfg: dict = None, *args, **kwargs): + if dj_cfg is None: + return None + + return Analyzer(dj_cfg) + + +data_analyzer_factory = DataAnalyzerFactory() class DataEvaluatorFactory(object): @@ -47,19 +80,19 @@ def __call__(self, eval_cfg: dict = None, *args, **kwargs): model_evaluator_factory = ModelEvaluatorFactory() -class ModelInferExecutorFactory(object): +class ModelInferEvaluatorFactory(object): def __call__(self, model_cfg: dict = None, *args, **kwargs): if model_cfg is None: return None if model_cfg.type == 'modelscope': - return ModelscopeInferExecutor(model_cfg) + return ModelscopeInferProbeExecutor(model_cfg) # add more model inference here freely -mode_infer_executor_factory = ModelInferExecutorFactory() +mode_infer_evaluator_factory = ModelInferEvaluatorFactory() class ModelTrainExecutorFactory(object): @@ -70,8 +103,25 @@ def __call__(self, model_cfg: dict = None, *args, **kwargs): if model_cfg.type == 'modelscope': return ModelscopeTrainExecutor(model_cfg, **kwargs) + elif model_cfg.type == 'easyanimate': + return EasyAnimateTrainExecutor(model_cfg, **kwargs) # add more model trainer here freely model_train_executor_factory = ModelTrainExecutorFactory() + + +class ModelInferExecutorFactory(object): + + def __call__(self, generate_cfg: dict = None, *args, **kwargs): + if generate_cfg is None: + return None + + if generate_cfg.type == 'easyanimate': + return EasyAnimateInferExecutor(generate_cfg, **kwargs) + + # add more data generation here freely + + +model_infer_executor_factory = ModelInferExecutorFactory() diff --git a/data_juicer/core/sandbox/hooks.py b/data_juicer/core/sandbox/hooks.py index d3ac97ea9..cf16376a9 100644 --- a/data_juicer/core/sandbox/hooks.py +++ b/data_juicer/core/sandbox/hooks.py @@ -1,15 +1,16 @@ +# yapf: disable import asyncio -import os from jsonargparse import dict_to_namespace from loguru import logger from data_juicer.config import get_init_configs, prepare_side_configs -from data_juicer.core import Analyzer -from data_juicer.core import Executor as DjExecutor -from data_juicer.core.sandbox.factories import (data_evaluator_factory, - mode_infer_executor_factory, +from data_juicer.core.sandbox.factories import 
(data_analyzer_factory, + data_evaluator_factory, + data_executor_factory, + mode_infer_evaluator_factory, model_evaluator_factory, + model_infer_executor_factory, model_train_executor_factory) from data_juicer.utils.constant import JobRequiredKeys from tools.hpo.execute_hpo_3sigma import modify_recipe_k_sigma @@ -57,7 +58,7 @@ def __init__(self, job_cfg, watcher, *args, **kwargs): def hook(self, **kwargs): self.specify_dj_and_extra_configs() - analyzer = Analyzer(self.inited_dj_cfg) + analyzer = data_analyzer_factory(self.inited_dj_cfg) # probe the data via Analyzer logger.info('Begin to analyze data') analyzer.run() @@ -85,8 +86,8 @@ def __init__(self, job_cfg, watcher, *args, **kwargs): def hook(self, **kwargs): self.specify_dj_and_extra_configs() - data_executor = DjExecutor(self.inited_dj_cfg) - model_infer_executor = mode_infer_executor_factory(self.other_cfg) + data_executor = data_executor_factory(self.inited_dj_cfg) + model_infer_executor = mode_infer_evaluator_factory(self.other_cfg) # TODO # probe the model (calling inference sub-pipeline) based on # original data, such that we know what is the "hard" data for @@ -163,7 +164,7 @@ def __init__(self, job_cfg, watcher, *args, **kwargs): def hook(self, **kwargs): self.specify_dj_and_extra_configs() - data_executor = DjExecutor(self.inited_dj_cfg) + data_executor = data_executor_factory(self.inited_dj_cfg) # basic routine to process data, users can customize this freely logger.info('Begin to process the data with given dj recipe') data_executor.run() @@ -189,15 +190,30 @@ def hook(self, **kwargs): # users can customize this freely logger.info('Begin to train the model with given model config') # update training dataset path - training_args = { - 'train_dataset': - self.other_cfg.dataset_path, - 'work_dir': - os.path.join(self.other_cfg.work_dir, 'model_trainer_outputs'), - } asyncio.run( - model_trainer.run(model_trainer.model_config['type'], - training_args, **kwargs)) + model_trainer.run(model_trainer.model_config['type'], **kwargs)) + return kwargs + + +class InferModelHook(BaseHook): + + def __init__(self, job_cfg, watcher, *args, **kwargs): + """ + Initialize the hook for model inference + + :param job_cfg: the job configs + :param watcher: for watching the result + """ + super(InferModelHook, self).__init__(job_cfg, watcher, *args, **kwargs) + + def hook(self, **kwargs): + self.specify_dj_and_extra_configs() + model_infer = model_infer_executor_factory(self.other_cfg, + watcher=self.watcher) + + logger.info('Begin to infer the model with given model config') + asyncio.run(model_infer.run(model_infer.model_config['type'], + **kwargs)) return kwargs @@ -219,10 +235,7 @@ def hook(self, **kwargs): # basic routine to evaluate the given data, # users can customize this freely logger.info('Begin to evaluate the data with given evaluator config') - processed_dataset = self.other_cfg.dataset_path - eval_res = data_evaluator.run(eval_type='data', - eval_obj=processed_dataset, - **kwargs) + eval_res = data_evaluator.run(eval_type='data', **kwargs) self.watcher.watch(eval_res, self.meta_name) return kwargs @@ -256,6 +269,7 @@ def hook(self, **kwargs): 'RefineRecipeViaModelFeedbackHook': RefineRecipeViaModelFeedbackHook, 'ProcessDataHook': ProcessDataHook, 'TrainModelHook': TrainModelHook, + 'InferModelHook': InferModelHook, 'EvaluateDataHook': EvaluateDataHook, 'EvaluateModelHook': EvaluateModelHook, } diff --git a/data_juicer/core/sandbox/model_executors.py b/data_juicer/core/sandbox/model_executors.py index 571140760..e5088aa1e 100644 ---
a/data_juicer/core/sandbox/model_executors.py +++ b/data_juicer/core/sandbox/model_executors.py @@ -1,10 +1,13 @@ import asyncio -import os.path +import os import re +import stat +import subprocess import sys import time -from data_juicer.config.config import namespace_to_dict +from jsonargparse import namespace_to_dict + from data_juicer.utils.file_utils import follow_read @@ -21,7 +24,7 @@ def __init__(self, model_config: dict, watcher=None): self.END_OF_MODEL_EXEC = \ " End of ModelExecutor's running " - async def run(self, run_type, run_obj, **kwargs): + async def run(self, run_type, run_obj=None, **kwargs): """ conduct some model-related execution tasks given specified run_type and run_obj @@ -40,7 +43,7 @@ async def run(self, run_type, run_obj, **kwargs): timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) log_f_name = os.path.join( - self.watcher.dj_cfg.work_dir, + self.watcher.sandbox_cfg.work_dir, f'model_exe_{run_type}_{timestamp}.log') self.watcher.model_exe_log_file = log_f_name with open(log_f_name, 'w') as log_f: @@ -56,10 +59,15 @@ async def run(self, run_type, run_obj, **kwargs): sys.stderr = original_stderr return summarized_watched_res - async def _run(self, run_type, run_obj, **kwargs): + def run_subprocess(self, script_path, run_args, working_dir, cmd='bash'): + run_args = [str(arg) for arg in run_args] + args = [cmd, script_path] + run_args + subprocess.run(args, cwd=working_dir) + + async def _run(self, run_type, run_obj=None, **kwargs): raise NotImplementedError - async def watch_run(self, run_type, run_obj, **kwargs): + async def watch_run(self, run_type, run_obj=None, **kwargs): """ watch the running process in an online manner, and return the summarized results @@ -125,7 +133,7 @@ def _watch_run(self, line, **kwargs): self.watcher.watch(loss_value, 'loss') -class ModelscopeInferExecutor(ModelScopeExecutor): +class ModelscopeInferProbeExecutor(ModelScopeExecutor): def __init__(self, model_config: dict): super().__init__(model_config) @@ -135,7 +143,7 @@ def __init__(self, model_config: dict): except ModuleNotFoundError: raise ModuleNotFoundError('modelscope package not installed') - async def _run(self, run_type, run_obj, **kwargs): + async def _run(self, run_type, run_obj=None, **kwargs): if run_type == 'infer_on_data': return self.executor(self.data_connector(run_obj), **kwargs) else: @@ -175,7 +183,7 @@ def build_executor(self, except ModuleNotFoundError: raise ModuleNotFoundError('modelscope package not installed') - async def _run(self, run_type, run_obj, **kwargs): + async def _run(self, run_type, run_obj=None, **kwargs): # training cfg updated, such as datasets and training parameters builder_kwargs = { 'model_name': self.model_config['model_name'], @@ -185,28 +193,81 @@ async def _run(self, run_type, run_obj, **kwargs): key_remapping = self.model_config['key_remapping'] else: key_remapping = None - if 'train_dataset' in run_obj: + if 'train_dataset' in self.model_config: builder_kwargs['train_dataset'] = self.data_connector( - run_obj['train_dataset'], + self.model_config['train_dataset'], split='train', key_remapping=key_remapping) - if 'eval_dataset' in run_obj: + if 'eval_dataset' in self.model_config: builder_kwargs['eval_dataset'] = self.data_connector( - run_obj['eval_dataset'], + self.model_config['eval_dataset'], split='val', key_remapping=key_remapping) - if 'work_dir' in run_obj: - builder_kwargs['work_dir'] = run_obj['work_dir'] + if 'work_dir' in self.model_config: + builder_kwargs['work_dir'] = self.model_config['work_dir'] 
self.work_dir = builder_kwargs['work_dir'] self.build_executor(**builder_kwargs) self.executor.train() -class EasySoraExecutor(BaseModelExecutor): +class EasyAnimateTrainExecutor(BaseModelExecutor): - def __init__(self, model_config: dict): - super().__init__(model_config) - raise NotImplementedError('To be implemented from easysora.') + def __init__(self, model_config: dict, watcher=None): + super().__init__(model_config, watcher) + cur_working_dir = os.getcwd() + self.script_path = os.path.join( + cur_working_dir, 'thirdparty/models/EasyAnimate/train_lora.sh') + self.working_dir = os.path.join(cur_working_dir, + 'thirdparty/models/EasyAnimate/') + # make sure executable + current_permissions = os.stat(self.script_path).st_mode + os.chmod( + self.script_path, + current_permissions | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + + async def _run(self, run_type, run_obj=None, **kwargs): + config = self.model_config.train + run_args = [ + config.model_path.pretrained_model_name_or_path, + config.model_path.transformer_path, + config.dataset_path.dataset_name, + config.dataset_path.dataset_meta_name, + config.training_config.sample_size, + config.training_config.mixed_precision, + config.training_config.batch_size_per_gpu, + config.training_config.gradient_accumulation_steps, + config.training_config.num_train_epochs, + config.training_config.dataloader_num_workers, + config.training_config.seed, config.saving_config.output_dir, + config.tracker_config.project_name, + config.tracker_config.experiment_name + ] + self.run_subprocess(self.script_path, run_args, self.working_dir) + + +class EasyAnimateInferExecutor(BaseModelExecutor): + + def __init__(self, model_config: dict, watcher=None): + super().__init__(model_config, watcher) + cur_working_dir = os.getcwd() + self.script_path = os.path.join( + cur_working_dir, 'thirdparty/models/EasyAnimate/infer_lora.sh') + self.working_dir = os.path.join(cur_working_dir, + './thirdparty/models/EasyAnimate/') + + async def _run(self, run_type, run_obj=None, **kwargs): + config = self.model_config.train + run_args = [ + config.model_path.pretrained_model_name_or_path, + config.model_path.transformer_path, config.model_path.lora_path, + config.infer_config.image_size, + config.infer_config.prompt_info_path, config.infer_config.gpu_num, + config.infer_config.batch_size, + config.infer_config.mixed_precision, + config.infer_config.video_num_per_prompt, config.infer_config.seed, + config.saving_config.output_video_dir + ] + self.run_subprocess(self.script_path, run_args, self.working_dir) class LLaVAExecutor(BaseModelExecutor): diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 3d030b170..9c5acd5c4 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -155,23 +155,27 @@ def compute_stats(self, sample, rank=None, context=False): sample[Fields.context][sampled_frames_key] = frames frame_images = [frame.to_image() for frame in frames] - # compute aesthetics_scores - model, processor = get_model(self.model_key, rank, self.use_cuda()) - inputs = processor(images=frame_images, - return_tensors='pt').to(model.device) - with torch.no_grad(): - outputs = model(**inputs) - if self.need_normalized_by_ten: - aesthetics_score = outputs.logits / 10.0 - else: - aesthetics_score = outputs.logits + if len(frame_images) > 0: + # compute aesthetics_scores + model, processor = get_model(self.model_key, rank=rank) + inputs = 
processor(images=frame_images, + return_tensors='pt').to(model.device) + with torch.no_grad(): + outputs = model(**inputs) + if self.need_normalized_by_ten: + aesthetics_score = outputs.logits / 10.0 + else: + aesthetics_score = outputs.logits - if self.reduce_mode == 'avg': - aesthetics_score = float(aesthetics_score.mean()) - elif self.reduce_mode == 'max': - aesthetics_score = float(aesthetics_score.max()) + if self.reduce_mode == 'avg': + aesthetics_score = float(aesthetics_score.mean()) + elif self.reduce_mode == 'max': + aesthetics_score = float(aesthetics_score.max()) + else: + aesthetics_score = float(aesthetics_score.min()) else: - aesthetics_score = float(aesthetics_score.min()) + aesthetics_score = 0.0 + aesthetics_scores.append(aesthetics_score) logger.debug(f'aesthetics_score: {aesthetics_scores}') diff --git a/data_juicer/ops/filter/video_duration_filter.py b/data_juicer/ops/filter/video_duration_filter.py index 9d9653332..a224e0dd0 100644 --- a/data_juicer/ops/filter/video_duration_filter.py +++ b/data_juicer/ops/filter/video_duration_filter.py @@ -1,7 +1,7 @@ import sys import numpy as np -from jsonargparse.typing import NonNegativeInt +from jsonargparse.typing import NonNegativeFloat from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, load_data_with_context, @@ -20,8 +20,8 @@ class VideoDurationFilter(Filter): """ def __init__(self, - min_duration: NonNegativeInt = 0, - max_duration: NonNegativeInt = sys.maxsize, + min_duration: NonNegativeFloat = 0, + max_duration: NonNegativeFloat = sys.maxsize, any_or_all: str = 'any', *args, **kwargs): diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index 58600eeac..a127b44de 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -171,23 +171,26 @@ def compute_stats(self, sample, rank=None, context=False): image = ImageOps.flip(image) video_frame_images_chunk.append(image) - inputs = processor(text=text_chunk, - images=video_frame_images_chunk, - return_tensors='pt', - truncation=True, - max_length=model.config.text_config. - max_position_embeddings, - padding=True).to(model.device) - - outputs = model(**inputs) - chunk_logits = outputs.logits_per_text / 100.0 - - if self.reduce_mode == 'avg': - chunk_similarity = chunk_logits.mean() - elif self.reduce_mode == 'max': - chunk_similarity = chunk_logits.max() + if len(video_frame_images_chunk) > 0: + inputs = processor(text=text_chunk, + images=video_frame_images_chunk, + return_tensors='pt', + truncation=True, + max_length=model.config.text_config. 
+ max_position_embeddings, + padding=True).to(model.device) + + outputs = model(**inputs) + chunk_logits = outputs.logits_per_text / 100.0 + + if self.reduce_mode == 'avg': + chunk_similarity = chunk_logits.mean() + elif self.reduce_mode == 'max': + chunk_similarity = chunk_logits.max() + else: + chunk_similarity = chunk_logits.min() else: - chunk_similarity = chunk_logits.min() + chunk_similarity = 0.0 similarity.append(float(chunk_similarity)) offset += count diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index c392b44d1..4b45a468f 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -135,21 +135,26 @@ def compute_stats(self, sample, rank=None, context=False): sample[Fields.context][sampled_frames_key] = frames frame_images = [frame.to_image() for frame in frames] - inputs = processor(images=frame_images, return_tensors='pt') - inputs = inputs.to(model.device) - outputs = model(**inputs) - logits = outputs.logits - cur_scores = [ - scores[1] for scores in torch.softmax(logits, dim=-1) - ] - cur_scores = torch.Tensor(cur_scores) - - if self.reduce_mode == 'avg': - cur_score = cur_scores.mean() - elif self.reduce_mode == 'max': - cur_score = cur_scores.max() + + if len(frame_images) > 0: + inputs = processor(images=frame_images, return_tensors='pt') + inputs = inputs.to(model.device) + outputs = model(**inputs) + logits = outputs.logits + cur_scores = [ + scores[1] for scores in torch.softmax(logits, dim=-1) + ] + cur_scores = torch.Tensor(cur_scores) + + if self.reduce_mode == 'avg': + cur_score = cur_scores.mean() + elif self.reduce_mode == 'max': + cur_score = cur_scores.max() + else: + cur_score = cur_scores.min() else: - cur_score = cur_scores.min() + cur_score = 0.0 + nsfw_scores.append(float(cur_score)) sample[Fields.stats][StatsKeys.video_nsfw_score] = nsfw_scores diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 2deee0eaf..7b5b5f264 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -138,19 +138,25 @@ def compute_stats(self, sample, rank=None, context=False): sample[Fields.context][sampled_frames_key] = frames frame_images = [frame.to_image() for frame in frames] - inputs = processor(images=frame_images, return_tensors='pt') - inputs = inputs.to(model.device) - outputs = model(**inputs) - logits = outputs.logits - cur_probs = [probs[1] for probs in torch.softmax(logits, dim=-1)] - cur_probs = torch.Tensor(cur_probs) - - if self.reduce_mode == 'avg': - cur_prob = cur_probs.mean() - elif self.reduce_mode == 'max': - cur_prob = cur_probs.max() + + if len(frame_images) > 0: + inputs = processor(images=frame_images, return_tensors='pt') + inputs = inputs.to(model.device) + outputs = model(**inputs) + logits = outputs.logits + cur_probs = [ + probs[1] for probs in torch.softmax(logits, dim=-1) + ] + cur_probs = torch.Tensor(cur_probs) + + if self.reduce_mode == 'avg': + cur_prob = cur_probs.mean() + elif self.reduce_mode == 'max': + cur_prob = cur_probs.max() + else: + cur_prob = cur_probs.min() else: - cur_prob = cur_probs.min() + cur_prob = 0.0 watermark_probs.append(float(cur_prob)) sample[Fields.stats][StatsKeys.video_watermark_prob] = watermark_probs diff --git a/data_juicer/ops/selector/__init__.py b/data_juicer/ops/selector/__init__.py index c37998a9a..a90f6db8e 100644 --- a/data_juicer/ops/selector/__init__.py +++ 
b/data_juicer/ops/selector/__init__.py @@ -1,5 +1,11 @@ -from . import frequency_specified_field_selector, topk_specified_field_selector +from . import (frequency_specified_field_selector, random_selector, + range_specified_field_selector, topk_specified_field_selector) from .frequency_specified_field_selector import FrequencySpecifiedFieldSelector +from .random_selector import RandomSelector +from .range_specified_field_selector import RangeSpecifiedFieldSelector from .topk_specified_field_selector import TopkSpecifiedFieldSelector -__all__ = ['FrequencySpecifiedFieldSelector', 'TopkSpecifiedFieldSelector'] +__all__ = [ + 'FrequencySpecifiedFieldSelector', 'RandomSelector', + 'RangeSpecifiedFieldSelector', 'TopkSpecifiedFieldSelector' +] diff --git a/data_juicer/ops/selector/random_selector.py b/data_juicer/ops/selector/random_selector.py new file mode 100644 index 000000000..19724d29d --- /dev/null +++ b/data_juicer/ops/selector/random_selector.py @@ -0,0 +1,49 @@ +from jsonargparse.typing import ClosedUnitInterval, PositiveInt + +from data_juicer.format.mixture_formatter import MixtureFormatter + +from ..base_op import OPERATORS, Selector + + +@OPERATORS.register_module('random_selector') +class RandomSelector(Selector): + """Selector to random select samples. """ + + def __init__(self, + select_ratio: ClosedUnitInterval = None, + select_num: PositiveInt = None, + *args, + **kwargs): + """ + Initialization method. + + :param select_ratio: The ratio to select. When both + select_ratio and select_num are set, the value corresponding + to the smaller number of samples will be applied. + :param select_num: The number of samples to select. When both + select_ratio and select_num are set, the value corresponding + to the smaller number of samples will be applied. + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self.select_ratio = select_ratio + self.select_num = select_num + + def process(self, dataset): + if len(dataset) <= 1: + return dataset + + if self.select_ratio is None and self.select_num is None: + return dataset + + select_num = 0 + if not self.select_ratio: + select_num = self.select_num + else: + select_num = int(self.select_ratio * len(dataset)) + if self.select_num and self.select_num < select_num: + select_num = self.select_num + + return MixtureFormatter.random_sample(dataset, + sample_number=select_num) diff --git a/data_juicer/ops/selector/range_specified_field_selector.py b/data_juicer/ops/selector/range_specified_field_selector.py new file mode 100644 index 000000000..f2e9f12c6 --- /dev/null +++ b/data_juicer/ops/selector/range_specified_field_selector.py @@ -0,0 +1,109 @@ +import heapq + +from jsonargparse.typing import ClosedUnitInterval, PositiveInt + +from data_juicer.utils.common_utils import stats_to_number + +from ..base_op import OPERATORS, Selector + + +@OPERATORS.register_module('range_specified_field_selector') +class RangeSpecifiedFieldSelector(Selector): + """Selector to select a range of samples based on the sorted + specified field value from smallest to largest. """ + + def __init__(self, + field_key: str = '', + lower_percentile: ClosedUnitInterval = None, + upper_percentile: ClosedUnitInterval = None, + lower_rank: PositiveInt = None, + upper_rank: PositiveInt = None, + *args, + **kwargs): + """ + Initialization method. + + :param field_key: Selector based on the specified value + corresponding to the target key. The target key + corresponding to multi-level field information need to be + separated by '.'. 
+ :param lower_percentile: The lower bound of the percentile to + be sample, samples will be selected if their specified field + values are greater than this lower bound. When both + lower_percentile and lower_rank are set, the value corresponding + to the larger number of samples will be applied. + :param upper_percentile: The upper bound of the percentile to + be sample, samples will be selected if their specified field + values are less or equal to the upper bound. When both + upper_percentile and upper_rank are set, the value corresponding + to the smaller number of samples will be applied. + :param lower_rank: The lower bound of the rank to be sample, + samples will be selected if their specified field values are + greater than this lower bound. When both lower_percentile and + lower_rank are set, the value corresponding to the larger number + of samples will be applied. + :param upper_rank: The upper bound of the rank to be sample, + samples will be selected if their specified field values are + less or equal to the upper bound. When both upper_percentile and + upper_rank are set, the value corresponding to the smaller number + of samples will be applied. + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self.field_key = field_key + self.lower_percentile = lower_percentile + self.upper_percentile = upper_percentile + self.lower_rank = lower_rank + self.upper_rank = upper_rank + + def process(self, dataset): + if len(dataset) <= 1 or not self.field_key: + return dataset + + if self.lower_percentile is None and self.lower_rank is None: + return dataset + if self.upper_percentile is None and self.upper_rank is None: + return dataset + + lower_bound, upper_bound = 0, len(dataset) + if self.lower_percentile is not None: + lower_bound = int(self.lower_percentile * len(dataset)) + if self.lower_rank is not None: + lower_bound = max(lower_bound, self.lower_rank) + if self.upper_percentile is not None: + upper_bound = int(self.upper_percentile * len(dataset)) + if self.upper_rank is not None: + upper_bound = min(upper_bound, self.upper_rank) + upper_bound = max(lower_bound, upper_bound) + + field_keys = self.field_key.split('.') + assert field_keys[0] in dataset.features.keys( + ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) + + def get_field_value_list(cur_dataset, field_keys): + if len(field_keys) == 1: + field_value_list = cur_dataset[field_keys[0]] + else: + field_value_list = [] + for item in cur_dataset[field_keys[0]]: + field_value = item + for key in field_keys[1:]: + assert key in field_value.keys( + ), "'{}' not in {}".format(key, field_value.keys()) + field_value = field_value[key] + field_value_list.append(field_value) + field_value_list = [stats_to_number(s) for s in field_value_list] + return field_value_list + + field_value_list = get_field_value_list(dataset, field_keys) + select_index = heapq.nsmallest(int(upper_bound), range(len(dataset)), + field_value_list.__getitem__) + sub_dataset = dataset.select(select_index) + + field_value_list = get_field_value_list(sub_dataset, field_keys) + select_index = heapq.nlargest(int(upper_bound - lower_bound), + range(len(sub_dataset)), + field_value_list.__getitem__) + + return sub_dataset.select(select_index) diff --git a/data_juicer/ops/selector/topk_specified_field_selector.py b/data_juicer/ops/selector/topk_specified_field_selector.py index b2ae275d6..573b2e09f 100644 --- a/data_juicer/ops/selector/topk_specified_field_selector.py +++ 
b/data_juicer/ops/selector/topk_specified_field_selector.py @@ -1,19 +1,10 @@ import heapq -import sys from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from ..base_op import OPERATORS, Selector - +from data_juicer.utils.common_utils import stats_to_number -def to_number(s, reverse=True): - try: - return float(s) - except Exception: - if reverse: - return -sys.maxsize - else: - return sys.maxsize +from ..base_op import OPERATORS, Selector @OPERATORS.register_module('topk_specified_field_selector') @@ -85,7 +76,8 @@ def process(self, dataset): assert key in field_value.keys(), "'{}' not in {}".format( key, field_value.keys()) field_value = field_value[key] - field_value_list.append(to_number(field_value, self.reverse)) + field_value_list.append( + stats_to_number(field_value, self.reverse)) if self.reverse: select_index = heapq.nlargest(int(select_num), range(len(dataset)), diff --git a/data_juicer/utils/common_utils.py b/data_juicer/utils/common_utils.py new file mode 100644 index 000000000..5bd336b9b --- /dev/null +++ b/data_juicer/utils/common_utils.py @@ -0,0 +1,21 @@ +import sys + +import numpy as np + + +def stats_to_number(s, reverse=True): + ''' + convert a stats value which can be string + of list to a float. + ''' + try: + if isinstance(s, str): + return float(s) + if s is None or s == []: + raise ValueError('empty value') + return float(np.asarray(s).mean()) + except Exception: + if reverse: + return -sys.maxsize + else: + return sys.maxsize diff --git a/demos/auto_evaluation_helm/README_ZH.md b/demos/auto_evaluation_helm/README_ZH.md index 2a30e1cbe..fe5950a91 100644 --- a/demos/auto_evaluation_helm/README_ZH.md +++ b/demos/auto_evaluation_helm/README_ZH.md @@ -32,7 +32,7 @@ docker run --gpus all --ipc=host --ulimit memlock=-1 -it --rm -v /dataset:/works dokcer 容器成功运行后在容器内运行安装脚本并登录 wandb: ```shell -cd /workspace/data-juicer/thirdparty +cd /workspace/data-juicer/thirdparty/LLM_ecosystems ./setup_megatron.sh ./setup_helm.sh wandb login @@ -49,7 +49,7 @@ docker commit data-juicer-eval 进入 Megatron-LM 目录并执行数据预处理脚本,该脚本会将 data-juicer 处理好的 jsonline(假设路径为 `/workspace/data/dataset.jsonl`)文件转化为二进制格式,并保存为 `/workspace.data/dataset_text_document.bin` 和 `/workspace.data/dataset_text_document.idx` 两个文件。 ```shell -cd /workspace/data-juicer/thirdparty/Megatron-LM +cd /workspace/data-juicer/thirdparty/LLM_ecosystems/Megatron-LM python tools/preprocess_data.py \ --input /workspace/data/dataset.jsonl \ --output-prefix dataset \ @@ -65,7 +65,7 @@ python tools/preprocess_data.py \ 进入 Megatron-LM 目录并执行如下指令 ```shell -cd /workspace/data-juicer/thirdparty/Megatron-LM +cd /workspace/data-juicer/thirdparty/LLM_ecosystems/Megatron-LM nohup bash /workspace/data-juicer/demos/auto_eval_helm/pretrain_example.sh > train.log 2>&1 & ``` diff --git a/demos/data/demo-dataset-videos.jsonl b/demos/data/demo-dataset-videos.jsonl new file mode 100644 index 000000000..81a417030 --- /dev/null +++ b/demos/data/demo-dataset-videos.jsonl @@ -0,0 +1,6 @@ +{"videos":["../../tests/ops/data/video1.mp4"], "text": "<__dj__video> a cartoon"} +{"videos":["../../tests/ops/data/video1.mp4"], "text": "<__dj__video> 一段卡通"} +{"videos":["../../tests/ops/data/video2.mp4"], "text": "<__dj__video> a man"} +{"videos":["../../tests/ops/data/video2.mp4"], "text": "<__dj__video> 一个男人"} +{"videos":["../../tests/ops/data/video3.mp4"], "text": "<__dj__video> two women"} +{"videos":["../../tests/ops/data/video3.mp4"], "text": "<__dj__video> 两个女人"} diff --git a/docs/Operators.md b/docs/Operators.md index 6bc3599ba..a35210161 100644 --- 
a/docs/Operators.md +++ b/docs/Operators.md @@ -14,7 +14,7 @@ The operators in Data-Juicer are categorized into 5 types. | [ Mapper ]( #mapper ) | 43 | Edits and transforms samples | | [ Filter ]( #filter ) | 41 | Filters out low-quality samples | | [ Deduplicator ]( #deduplicator ) | 5 | Detects and removes duplicate samples | -| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking | +| [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | All the specific operators are listed below, each featured with several capability tags. @@ -160,6 +160,8 @@ All the specific operators are listed below, each featured with several capabili | Operator | Domain | Lang | Description | |------------------------------------|---------|--------|-----------------------------------------------------------------------| | frequency_specified_field_selector | General | en, zh | Selects top samples by comparing the frequency of the specified field | +| random_selector | General | en, zh | Selects samples randomly | +| range_specified_field_selector | General | en, zh | Selects samples within a specified range by comparing the values of the specified field | | topk_specified_field_selector | General | en, zh | Selects top samples by comparing the values of the specified field | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index 3ee94d381..855d109a7 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -14,7 +14,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | [ Mapper ]( #mapper ) | 43 | 对数据样本进行编辑和转换 | | [ Filter ]( #filter ) | 41 | 过滤低质量样本 | | [ Deduplicator ]( #deduplicator ) | 5 | 识别、删除重复样本 | -| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 | +| [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | 下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。 @@ -157,6 +157,8 @@ Data-Juicer 中的算子分为以下 5 种类型。 | 算子 | 场景 | 语言 | 描述 | |-------------------------------------|----------|---------|------------------------------------------------| | frequency_specified_field_selector | General | en, zh | 通过比较指定字段的频率选出前 k 个样本 | +| random_selector | General | en, zh | 随机筛选 k 个样本 | +| range_specified_field_selector | General | en, zh | 通过比较指定字段的值选出指定范围的 k 个样本 | | topk_specified_field_selector | General | en, zh | 通过比较指定字段的值选出前 k 个样本 | ## 贡献 diff --git a/docs/Sandbox-ZH.md b/docs/Sandbox-ZH.md index 647b396fb..539a959de 100644 --- a/docs/Sandbox-ZH.md +++ b/docs/Sandbox-ZH.md @@ -1,7 +1,9 @@ # 用户指南 ## 应用和成果 -我们利用Data-Juicer沙盒实验室套件,通过数据与模型间的系统性研发工作流,调优数据和模型,相关工作请参考[论文](http://arxiv.org/abs/2407.11784)。在本工作中,我们在[VBench](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard)文生视频排行榜取得了新的榜首,模型已在[ModelScope](https://modelscope.cn/models/Data-Juicer/Data-Juicer-T2V)和[HuggingFace](https://huggingface.co/datajuicer/Data-Juicer-T2V)平台发布。相关的沙盒实验脚本和数据集正在紧锣密鼓整理中,敬请期待。 +我们利用Data-Juicer沙盒实验室套件,通过数据与模型间的系统性研发工作流,调优数据和模型,相关工作请参考[论文](http://arxiv.org/abs/2407.11784)。在本工作中,我们在[VBench](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard)文生视频排行榜取得了新的榜首。模型已在[ModelScope](https://modelscope.cn/models/Data-Juicer/Data-Juicer-T2V)和[HuggingFace](https://huggingface.co/datajuicer/Data-Juicer-T2V)平台发布,训练模型的[数据集](http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/Data-Juicer-T2V/data_juicer_t2v_optimal_data_pool.zip)也已开源。 ![top-1_in_vbench](https://img.alicdn.com/imgextra/i3/O1CN01Ssg83y1EPbDgTzexn_!!6000000000344-2-tps-2966-1832.png) 
+复现论文实验请参考下面的sandbox使用指南,下图的实验流程,[初始数据集](http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/Data-Juicer-T2V/data_juicer_t2v_init_data_pool.zip),以及该流程的工作流的配置文件demo:[1_single_op_pipline.yaml](../configs/demo/bench/1_single_op_pipline.yaml)、[2_multi_op_pipline.yaml](../configs/demo/bench/2_multi_op_pipline.yaml)、[3_duplicate_pipline.yaml](../configs/demo/bench/3_duplicate_pipline.yaml)。 +![bench_bottom_up](https://img.alicdn.com/imgextra/i3/O1CN01ZwtQuG1sdPnbYYVhH_!!6000000005789-2-tps-7838-3861.png) ## 什么是沙盒实验室(DJ-Sandbox)? 在Data-Juicer中,数据沙盒实验室为用户提供了持续生产数据菜谱的最佳实践,其具有低开销、可迁移、有指导性等特点,用户在沙盒中基于一些小规模数据集、模型对数据菜谱进行快速实验、迭代、优化,再迁移到更大尺度上,大规模生产高质量数据以服务大模型。 @@ -14,9 +16,19 @@ pip install -v -e .[sandbox] ``` -**注意**:一些沙盒的依赖还需要额外的领域依赖。例如,如果用户想要在沙盒中训练一个 ModelScope 平台的NLP模型,那可能需要为 `modelscope` 库 +**注意**:一些沙盒的依赖还需要额外的领域依赖。 + +1. 如果用户想要在沙盒中训练一个 ModelScope 平台的NLP模型,那可能需要为 `modelscope` 库 安装额外的 `nlp` 领域依赖(参考其[安装文档](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) )。 -再比如,使用VBench测评视频时需要安装Detectron2,推荐安装如下分支。 + +2. 要使用[EasyAnimate](https://github.com/aigc-apps/EasyAnimate)时需要执行如下安装脚本: +```shell +cd thirdparty/models/ +bash setup_easyanimate.sh +cd ../../ +``` + +3. 使用VBench测评视频时需要安装Detectron2,推荐安装如下分支。 ```shell pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@b7c7f4ba82192ff06f2bbb162b9f67b00ea55867 ``` @@ -99,12 +111,14 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml 在沙盒流水线的单次运行中,包括了四个大的步骤,其中涉及到如下一些可配置组件,他们分别对应了一个用于初始化这些组件的工厂类: - **数据处理(DataExecutor)**:数据处理的执行器,即Data-Juicer的executor +- **数据分析(DataAnalyzer)**:数据分析器,即Data-Juicer的analyzer - **数据评估(DataEvaluator)**:数据集质量的评估器 +- **模型数据评估(ModelInferEvaluator)**:利用模型推理结果的数据集质量的评估器 - **模型训练(ModelTrainExecutor)**:模型训练执行器 - **模型推理(ModelInferExecutor)**:模型推理执行器 - **模型评估(ModelEvaluator)**:模型性能的评估器 -除了DataExecutor,其余组件均可在配置文件中指定`type`参数来选择具体的执行或者评估类型,如数据评估组件支持`type`为`"dj_text_quality_classifier"`来使用Data-Juicer的质量分类器工具来对数据集进行评估,而模型训练组件`type`为`"modelscope"`来训练来自于ModelScope平台的模型。 +除了DataExecutor和DataAnalyzer,其余组件均可在配置文件中指定`type`参数来选择具体的执行或者评估类型,如数据评估组件支持`type`为`"dj_text_quality_classifier"`来使用Data-Juicer的质量分类器工具来对数据集进行评估,而模型训练组件`type`为`"modelscope"`来训练来自于ModelScope平台的模型。 目前支持的组件工厂以及工厂中支持的组件包括: @@ -112,21 +126,28 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml | 组件 | 功能 | `run`方法说明 | 参考材料 | | --- | --- | --- | --- | -| `Gpt3QualityEvaluator` | 使用Data-Juicer复现的GPT-3文本质量分类器对数据集进行质量评估 |
- `eval_type`:该评估器评估对象类型,目前只支持`"data"`
- `eval_obj`:待评估的数据集路径
- 返回值:待评估数据集样本质量打分均值
| [Data-Juicer质量分类器工具集](https://github.com/modelscope/data-juicer/tree/main/tools/quality_classifier) | +| `Gpt3QualityEvaluator` | 使用Data-Juicer复现的GPT-3文本质量分类器对数据集进行质量评估 |
- `eval_type`:该评估器评估对象类型,目前只支持`"data"`
- `eval_obj`:未使用的参数
- 返回值:待评估数据集样本质量打分均值
| [Data-Juicer质量分类器工具集](https://github.com/modelscope/data-juicer/tree/main/tools/quality_classifier) | | `VBenchEvaluator` | 使用VBench对基于prompt生成的视频进行多维度的评估 |
- `eval_type`:该评估器评估对象类型,目前只支持`"data"`
- `eval_obj`:未使用的参数
- 返回值:待评生成视频集各维度打分均值
| [VBench论文](https://arxiv.org/abs/2311.17982) | | `InceptionEvaluator` | 通过视频分类模型抽取特征测评生成的视频 |
- `eval_type`:该评估器评估对象类型,目前只支持`"data"`
- `eval_obj`:未使用的参数
- 返回值:根据给定的metric返回对应的字典
| [Inception Metrics](https://github.com/NVlabs/long-video-gan/tree/main/metrics) | +- 模型数据评估工厂 -- ModelInferEvaluatorFactory + +| 组件 | 功能 | `run`方法说明 | 参考材料 | +| --- | --- | --- | --- | +| `ModelscopeInferProbeExecutor` | 用数据集对ModelScope平台上的模型进行推理,并返回推理结果 |
- `run_type`:推理类型。需要在组件配置文件中设置`type`参数为`"modelscope"`来激活该组件
- `run_obj`:需要送入模型推理的采样数据集
| [ModelScope模型推理文档](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) | + - 模型训练工厂 -- ModelTrainExecutorFactory | 组件 | 功能 | `run`方法说明 | 参考材料 | | --- | --- | --- | --- | -| `ModelscopeTrainExecutor` | 用数据集对ModelScope平台上的模型进行训练任务,并监测loss变化信息 |
- `run_type`:训练模型类型。需要在组件配置文件中设置`type`参数为`"modelscope"`来激活该组件
- `run_obj`:额外训练配置。除了组件配置之外的额外配置信息,包括数据集路径以及存放训练产出的工作路径等,由于他们会随着流水线运行发生变化,因此他们会在流水线中动态设置
| [ModelScope模型训练文档](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) | +| `ModelscopeTrainExecutor` | 用Data-Juicer产出的数据集训练ModelScope平台上的模型,并监测loss变化信息 |
- `run_type`:训练模型类型。需要在组件配置文件中设置`type`参数为`"modelscope"`来激活该组件
- `run_obj`:未使用的参数
| [ModelScope模型训练文档](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) | +| `EasyAnimateTrainExecutor` | 用Data-Juicer产出的数据集训练文生视频模型EasyAnimate的LoRA模型,并监测loss变化信息 |
- `run_type`:训练模型类型。需要在组件配置文件中设置`type`参数为`"easyanimate"`来激活该组件
- `run_obj`:未使用的参数
| [EasyAnimate](https://github.com/aigc-apps/EasyAnimate) | - 模型推理工厂 -- ModelInferExecutorFactory | 组件 | 功能 | `run`方法说明 | 参考材料 | | --- | --- | --- | --- | -| `ModelscopeInferExecutor` | 用数据集对ModelScope平台上的模型进行推理,并返回推理结果 |
- `run_type`:推理类型。需要在组件配置文件中设置`type`参数为`"infer_on_data"`来激活该组件
- `run_obj`:需要送入模型推理的采样数据集
| [ModelScope模型推理文档](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) | +| `EasyAnimateInferExecutor` | 用VBench的prompt数据集对EasyAnimate模型进行推理,并存储生成的视频 |
- `run_type`:推理类型。需要在组件配置文件中设置`type`参数为`"easyanimate"`来激活该组件
- `run_obj`:未使用的参数
| [EasyAnimate](https://github.com/aigc-apps/EasyAnimate) | - 模型评估工厂 -- ModelEvaluatorFactory - TBD @@ -134,7 +155,7 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml 详细定义可参考`data_juicer/core/sandbox/factories.py`。 # 开发者指南 正如上一章节所说,开发者可开发更多的可配置组件并将它们添加到对应的工厂类中,并用参数`type`进行实例化方法分配。实现了组件后,开发者可以将它们封装为钩子,并将钩子注册到工作列表中,工作列表在流水线中进行编排后,沙盒流水线执行时,会依次在每个步骤执行每个工作列表中的工作。这其中的每一个部分:组件、组件工厂、钩子、工作列表、流水线注册与执行流程编排,都可以由开发者自定义。各个部分的关系由下图示意。 -![sandbox-pipeline](https://img.alicdn.com/imgextra/i1/O1CN01JsgSuu22ycGdJFRdc_!!6000000007189-2-tps-3640-2048.png) +![sandbox-pipeline](https://img.alicdn.com/imgextra/i2/O1CN01B3zR0t29noFoHGsyq_!!6000000008113-2-tps-3878-2212.png) ## 组件内部实现 目前组件主要分为两个大类: @@ -173,13 +194,14 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml | 钩子 | 功能 | 依赖的组件工厂 | 依赖的工具或库 | 注册工作列表 | | --- | --- | --- | --- | --- | -| `ProbeViaAnalyzerHook` | 分析与洞察数据集质量、多样性等维度分布 | - | Data-Juicer分析器Analyzer | 洞察工作列表(probe_jobs)
评估工作列表(evaluation_jobs) | -| `ProbeViaModelInferHook` | 分析与洞察数据集对于模型的影响,挖掘与洞察“难”数据与“脏”数据 | 模型推理工厂(ModelInferExecutorFactory) | - | 洞察工作列表(probe_jobs)
评估工作列表(evaluation_jobs) | +| `ProbeViaAnalyzerHook` | 分析与洞察数据集质量、多样性等维度分布 | 数据分析工厂(DataAnalyzerFactory) | Data-Juicer分析器Analyzer | 洞察工作列表(probe_jobs)
评估工作列表(evaluation_jobs) | +| `ProbeViaModelInferHook` | 分析与洞察数据集对于模型的影响,挖掘与洞察“难”数据与“脏”数据 | 数据处理工厂(DataExecutorFactory)
模型数据评估工厂(ModelInferEvaluatorFactory) | Data-Juicer数据处理器Executor | 洞察工作列表(probe_jobs)
评估工作列表(evaluation_jobs) | | `RefineRecipeViaKSigmaHook` | 根据数据集洞察结果,利用k-sigma方法对数据菜谱超参进行微调 | - | Data-Juicer超参优化工具HPO中的k-sigma菜谱微调工具 | 菜谱微调工作列表(refine_recipe_jobs) | | `RefineRecipeViaModelFeedbackHook` | 利用模型洞察与反馈结果对数据菜谱超参进行微调 | TODO | - | 菜谱微调工作列表(refine_recipe_jobs) | -| `ProcessDataHook` | 基于当前数据菜谱对数据集进行处理与清洗 | - | Data-Juicer数据处理器Executor | 执行工作列表(execution_jobs) | -| `TrainModelHook` | 基于当前数据集训练一个模型 | 模型训练工厂(ModelTrainExecutorFactory) | - | 执行工作列表(execution_jobs) | -| `EvaluateDataHook` | 对当前数据集进行数据质量等维度的评估 | 数据评估工厂(DataEvaluatorFactory) | - | 评估工作列表(evaluation_jobs) | +| `ProcessDataHook` | 基于当前数据菜谱对数据集进行处理与清洗 | 数据处理工厂(DataExecutorFactory) | Data-Juicer数据处理器Executor | 执行工作列表(execution_jobs) | +| `TrainModelHook` | 基于当前数据集训练一个模型 | 模型训练工厂(ModelTrainExecutorFactory) | [EasyAnimate](../thirdparty/models/README_ZH.md) | 执行工作列表(execution_jobs) | +| `InferModelHook` | 基于给定输入让模型产生输出 | 模型推理工厂(ModelInferExecutorFactory) | [EasyAnimate](../thirdparty/models/README_ZH.md) | 执行工作列表(execution_jobs) | +| `EvaluateDataHook` | 对当前数据集进行数据质量等维度的评估 | 数据评估工厂(DataEvaluatorFactory) | 图像或视频的[inception metrics](../tools/mm_eval/inception_metrics/README_ZH.md),如FID、FVD
[VBench](../tools/mm_eval/vbench_metrics/README_ZH.md) | 评估工作列表(evaluation_jobs) | | `EvaluateModelHook` | 对当前训练后的模型进行评估 | 模型评估工厂(ModelEvaluatorFactory) | - | 评估工作列表(evaluation_jobs) | 值得注意的是,一个钩子可以在多个工作列表进行注册,因为这个钩子在不同的流水线阶段可以扮演不同的角色,比如我们可以对处理前后的数据集都进行分析,以比较数据集处理前后的质量、多样性等维度的变化情况。 diff --git a/docs/Sandbox.md b/docs/Sandbox.md index 6e12afadf..9e45f4ca6 100644 --- a/docs/Sandbox.md +++ b/docs/Sandbox.md @@ -1,7 +1,9 @@ # User Guide ## Applications and Achievements -Leveraging the Data-Juicer Sandbox Laboratory Suite, we systematically fine-tuned data and models through a dedicated research and development workflow between data and models. For more detailed information, please refer to our [paper](http://arxiv.org/abs/2407.11784). In our work, we have secured a new leading position on the [VBench](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard) text-to-video leaderboard, and the model is now publicly available on the [ModelScope](https://modelscope.cn/models/Data-Juicer/Data-Juicer-T2V) and [HuggingFace](https://huggingface.co/datajuicer/Data-Juicer-T2V) platforms. The associated sandbox experiment scripts and datasets are being actively prepared for release, so please look forward to it. +Leveraging the Data-Juicer Sandbox Laboratory Suite, we systematically fine-tuned data and models through a dedicated research and development workflow between data and models. For more detailed information, please refer to our [paper](http://arxiv.org/abs/2407.11784). In our work, we have secured a new leading position on the [VBench](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard) text-to-video leaderboard. The model is now publicly available on the [ModelScope](https://modelscope.cn/models/Data-Juicer/Data-Juicer-T2V) and [HuggingFace](https://huggingface.co/datajuicer/Data-Juicer-T2V) platforms, and the training [dataset](http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/Data-Juicer-T2V/data_juicer_t2v_optimal_data_pool.zip) has also been available. ![top-1_in_vbench](https://img.alicdn.com/imgextra/i3/O1CN01Ssg83y1EPbDgTzexn_!!6000000000344-2-tps-2966-1832.png) +To reproduce the paper's experiments, please refer to the sandbox usage guide below, the experimental process in the following figure, the [initial dataset](http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/MM_data/our_refined_data/Data-Juicer-T2V/data_juicer_t2v_init_data_pool.zip), and the configuration file demos for the process: [1_single_op_pipline.yaml](../configs/demo/bench/1_single_op_pipline.yaml), [2_multi_op_pipline.yaml](../configs/demo/bench/2_multi_op_pipline.yaml), [3_duplicate_pipline.yaml](../configs/demo/bench/3_duplicate_pipline.yaml). +![bench_bottom_up](https://img.alicdn.com/imgextra/i3/O1CN01ZwtQuG1sdPnbYYVhH_!!6000000005789-2-tps-7838-3861.png) ## What is DJ-Sandbox? In Data-Juicer, the data sandbox laboratory provides users with the best practices for continuously producing data recipes. It features low overhead, portability, and guidance. In the sandbox, users can quickly experiment, iterate, and refine data recipes based on small-scale datasets and models, before scaling up to produce high-quality data to serve large-scale models. @@ -15,9 +17,19 @@ pip install -v -e .[sandbox] ``` -**NOTICE**: some sandbox-related dependencies require extra domain dependencies. For example, if users want to train an NLP model from ModelScope +**NOTICE**: some sandbox-related dependencies require extra domain dependencies. + +1. 
If users want to train an NLP model from ModelScope in the sandbox, you might need to install extra `nlp` dependencies for `modelscope` library (see the [installation docs](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)). -For example, when using VBench to benchmark videos, it is necessary to install Detectron2. The following branch is recommended for installation. + +2. To use [EasyAnimate](https://github.com/aigc-apps/EasyAnimate), you need to execute the following installation script: +```shell +cd thirdparty/models/ +bash setup_easyanimate.sh +cd ../../ +``` + +3. When using VBench to benchmark videos, it is necessary to install Detectron2. The following branch is recommended for installation. ```shell pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@b7c7f4ba82192ff06f2bbb162b9f67b00ea55867 ``` @@ -100,12 +112,14 @@ If the `hpo_config` is set in the configuration file and appropriate optimizatio In a single trial of the sandbox pipeline, four major steps involve various configurable components. Each of these components corresponds to a factory class used to initialize them: - **Data Processing (DataExecutor)**: Executor for dataset processing, i.e., the executor of Data-Juicer +- **Data Analyzing(DataAnalyzer)**: Analyzer for dataset, i.e., the analyzer of Data-Juicer - **Data Evaluation (DataEvaluator)**: Evaluator on the quality of the dataset +- **Model-Data Evaluation(ModelInferEvaluator)**: Evaluator of dataset quality using the model's inference results - **Model Training (ModelTrainExecutor)**: Executor for model training - **Model Inference (ModelInferExecutor)**: Executor for model inference - **Model Evaluation (ModelEvaluator)**: Evaluator on the performance of the model -Except for DataExecutor, the rest of the components can be specified in the configuration file using the `type` parameter to choose a specific execution or evaluation type. For example, the data evaluation component supports a `type` of `"dj_text_quality_classifier"` to utilize Data-Juicer's text quality classifier tool for evaluating the dataset, while the model training component `type` can be set to `"modelscope"` to train a model from the ModelScope platform. +Except for DataExecutor and DataAnalyzer, the rest of the components can be specified in the configuration file using the `type` parameter to choose a specific execution or evaluation type. For example, the data evaluation component supports a `type` of `"dj_text_quality_classifier"` to utilize Data-Juicer's text quality classifier tool for evaluating the dataset, while the model training component `type` can be set to `"modelscope"` to train a model from the ModelScope platform. The currently supported component factories and the components supported within each factory are as follows: @@ -113,21 +127,28 @@ The currently supported component factories and the components supported within | Component | Function | Desc. of Method `run` | Reference Materials | | --- | --- | --- | --- | -| `Gpt3QualityEvaluator` | Evaluate the quality of a dataset using the GPT-3 text quality classifier reproduced by Data-Juicer. |
- `eval_type`: The type of the object to be evaluated by the evaluator, currently only supports `"data"`.
- `eval_obj`: The path to the dataset to be evaluated.
- Return: The average quality score of the dataset samples.
| [Data-Juicer Quality Classifier Toolkit](https://github.com/modelscope/data-juicer/tree/main/tools/quality_classifier) | -| `VBenchEvaluator` | Evaluate the generated videos according to given prompts in multi dimensions |
- `eval_type`: The type of the object to be evaluated by the evaluator, currently only supports `"data"`
- `eval_obj`: A useless parameter
- Return: The average score of generated videos in multi dimensions.
| [VBench paper](https://arxiv.org/abs/2311.17982) | -| `InceptionEvaluator` | Evaluate the generated videos by features extracted from video classification models. |
- `eval_type`: The type of the object to be evaluated by the evaluator, currently only supports `"data"`
- `eval_obj`: A useless parameter
- Return: A dictionary of scores in the given metric.
| [Inception Metrics](https://github.com/NVlabs/long-video-gan/tree/main/metrics) | +| `Gpt3QualityEvaluator` | Evaluate the quality of a dataset using the GPT-3 text quality classifier reproduced by Data-Juicer. |
- `eval_type`: The type of the object to be evaluated by the evaluator, currently only supports `"data"`.
- `eval_obj`: An unused parameter.
- Return: The average quality score of the dataset samples.
| [Data-Juicer Quality Classifier Toolkit](https://github.com/modelscope/data-juicer/tree/main/tools/quality_classifier) | +| `VBenchEvaluator` | Evaluate the generated videos according to given prompts in multiple dimensions |
- `eval_type`: The type of the object to be evaluated by the evaluator, currently only supports `"data"`
- `eval_obj`: An unused parameter.
- Return: The average score of the generated videos in multiple dimensions.
| [VBench paper](https://arxiv.org/abs/2311.17982) | +| `InceptionEvaluator` | Evaluate the generated videos by features extracted from video classification models. |
- `eval_type`: The type of the object to be evaluated by the evaluator, currently only supports `"data"`
- `eval_obj`: An unused parameter.
- Return: A dictionary of scores for the given metrics.
| [Inception Metrics](https://github.com/NVlabs/long-video-gan/tree/main/metrics) | + +- ModelInferEvaluatorFactory + +| Component | Function | Desc. of Method `run` | Reference Materials | +| --- | --- | --- | --- | +| `ModelscopeInferExecutor` | Perform inference on a model from the ModelScope platform using a specified sampled dataset, and return the inference results. |
- `run_type`: Type of model inference. We need to set `type` arg as `"modelscope"` in the component configuration file to activate this component.
- `run_obj`: Sampled dataset to be fed into model inference.
| [ModelScope Docs of Model Inference](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) | - ModelTrainExecutorFactory | Component | Function | Desc. of Method `run` | Reference Materials | | --- | --- | --- | --- | -| `ModelscopeTrainExecutor` | Perform a training task on a model from the ModelScope platform using specified datasets, and monitor the change in training loss. |
- `run_type`: Type of model training. We need to set `type` arg as `"modelscope"` in the component configuration file to activate this component.
- `run_obj`: Additional training configurations. Apart from the component configuration, this includes the dataset paths and the working directory for storing the training output. As they may change during the pipeline run, they are set dynamically within the pipeline.
| [ModelScope Docs of Model Training](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) | +| `ModelscopeTrainExecutor` | Perform a training task on a model from the ModelScope platform using specified datasets, and monitor the change in training loss. |
- `run_type`: Type of model training. We need to set `type` arg as `"modelscope"` in the component configuration file to activate this component.
- `run_obj`: An unused parameter.
| [ModelScope Docs of Model Training](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) | +| `EasyAnimateTrainExecutor` | Perform a LoRA training task on the EasyAnimate text-to-video model, and monitor the change in training loss. |
- `run_type`: Type of model training. We need to set `type` arg as `"easyanimate"` in the component configuration file to activate this component.
- `run_obj`: An unused parameter.
| [EasyAnimate](https://github.com/aigc-apps/EasyAnimate) | - ModelInferExecutorFactory | Component | Function | Desc. of Method `run` | Reference Materials | | --- | --- | --- | --- | -| `ModelscopeInferExecutor` | Perform inference on a model from the ModelScope platform using a specified sampled dataset, and return the inference results. |
- `run_type`: Type of model inference. We need to set `type` arg as `"infer_on_data"` in the component configuration file to activate this component.
- `run_obj`: Sampled dataset to be fed into model inference.
| [ModelScope Docs of Model Inference](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) | +| `EasyAnimateInferExecutor` | Perform inference on the EasyAnimate text-to-video model with the prompts from VBench, and save the generated videos. |
- `run_type`: Type of model inference. We need to set `type` arg as `"easyanimate"` in the component configuration file to activate this component.
- `run_obj`: An unused parameter.
| [EasyAnimate](https://github.com/aigc-apps/EasyAnimate) | - ModelEvaluatorFactory - TBD @@ -135,7 +156,7 @@ The currently supported component factories and the components supported within Please refer to `data_juicer/core/sandbox/factories.py` for detailed definitions. # Developer Guide As mentioned in the previous section, developers can develop customized configurable components and add them to the corresponding factory classes, then route to appropriate instantiation methods using the `type` parameter. Once the components are implemented, developers can encapsulate them as hooks and register the hooks into the job list. After the job list is orchestrated in the pipeline, when the sandbox pipeline is executed, each job in the job list will be executed in sequence at each step. Each of these parts - components, component factory, hooks, job lists, and the registration and execution orchestration of the pipeline - can be customized by the developer. The relationship among these parts is illustrated in the diagram below. -![sandbox-pipeline](https://img.alicdn.com/imgextra/i1/O1CN01JsgSuu22ycGdJFRdc_!!6000000007189-2-tps-3640-2048.png) +![sandbox-pipeline](https://img.alicdn.com/imgextra/i2/O1CN01B3zR0t29noFoHGsyq_!!6000000008113-2-tps-3878-2212.png) ## The Internal Implementation of Components Currently, components are mainly divided into two major categories: @@ -173,14 +194,15 @@ In general, we only need to implement one type of hook function for a type of co | Hook | Function | Dependent Component Factory | Dependent Tool or Library | Registered Job List | | --- | --- | --- | --- | --- | -| `hook_probe_via_analyzer` | Analyze and probe the quality and diversity distribution of the dataset | - | Data-Juicer Analyzer |
- probe_jobs
- evaluation_jobs
| -| `hook_probe_via_model_infer` | Analyze and understand the impact of the dataset on the model, explore and probe "difficult" and "dirty" data | ModelInferExecutorFactory | - |
- probe_jobs
- evaluation_jobs
| -| `hook_refine_recipe_via_k_sigma` | Refine data recipe hyperparameters using the k-sigma method based on the probe results of the dataset | - | k-sigma recipe refinement tool of Data-Juicer Hyperparameter Optimization (HPO) toolkit |
- refine_recipe_jobs
| -| `hook_refine_recipe_via_model_feedback` | Refine data recipe hyperparameters using model probe and feedback results | TODO | - |
- refine_recipe_jobs
| -| `hook_process_data` | Process and clean the dataset based on the current data recipe | - | Data-Juicer Executor |
- execution_jobs
| Always | -| `hook_train_model` | Train a model based on the current dataset | ModelTrainExecutorFactory | - |
- execution_jobs
| -| `hook_evaluate_data` | Evaluate the dataset in terms of data quality and other dimensions | DataEvaluatorFactory | - |
- evaluation_jobs
| -| `hook_evaluate_model` | Evaluate the trained model | ModelEvaluatorFactory | - |
- evaluation_jobs
| +| `ProbeViaAnalyzerHook` | Analyze and probe the quality and diversity distribution of the dataset | DataAnalyzerFactory | Data-Juicer Analyzer | - probe_jobs
- evaluation_jobs | +| `ProbeViaModelInferHook` | Analyze and understand the impact of the dataset on the model, explore and probe "difficult" and "dirty" data | DataExecutorFactory
ModelInferEvaluatorFactory | Data-Juicer Executor | - probe_jobs
- evaluation_jobs | +| `RefineRecipeViaKSigmaHook` | Refine data recipe hyperparameters using the k-sigma method based on the probe results of the dataset | - | k-sigma recipe refinement tool of Data-Juicer Hyperparameter Optimization (HPO) toolkit | - refine_recipe_jobs | +| `RefineRecipeViaModelFeedbackHook` | Refine data recipe hyperparameters using model probe and feedback results | TODO | - | - refine_recipe_jobs | +| `ProcessDataHook` | Process and clean the dataset based on the current data recipe | DataExecutorFactory | Data-Juicer Executor | - execution_jobs | +| `TrainModelHook` | Train a model based on the current dataset | ModelTrainExecutorFactory | [EasyAnimate](../thirdparty/models/README.md) | - execution_jobs | +| `InferModelHook` | Generate model output based on the given input | ModelInferExecutorFactory | [EasyAnimate](../thirdparty/models/README.md) | - execution_jobs | +| `EvaluateDataHook` | Evaluate the dataset in terms of data quality and other dimensions | DataEvaluatorFactory | [inception metrics](../tools/mm_eval/inception_metrics/README.md) for images and videos, such as FID and FVD
[VBench](../tools/mm_eval/vbench_metrics/README.md) | - evaluation_jobs | +| `EvaluateModelHook` | Evaluate the trained model | ModelEvaluatorFactory | - | - evaluation_jobs | It is worth noting that a hook can be registered in multiple job lists, as this hook can play different roles in different steps of the pipeline. For example, we can analyze and probe both the pre-processed and post-processed datasets to compare the changes in quality, diversity, and other dimensions before and after data processing. diff --git a/environments/sandbox_requires.txt b/environments/sandbox_requires.txt index 4c60fe9e3..7f1d27a25 100644 --- a/environments/sandbox_requires.txt +++ b/environments/sandbox_requires.txt @@ -1,4 +1,4 @@ -torch>=1.11.0,<2.0.0 +torch>=1.11.0 wandb fire pyspark @@ -6,3 +6,18 @@ pyspark vbench # modelscope-related modelscope +# easyanimate-related +Pillow +deepspeed +safetensors +timm +xformers +decord +opencv-python +omegaconf +imageio[ffmpeg] +imageio[pyav] +tensorboard +diffusers==0.27.0 +transformers==4.37.2 +func_timeout diff --git a/environments/science_requires.txt b/environments/science_requires.txt index 0060ffeeb..e848ea5ba 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,4 +1,4 @@ -torch>=1.11.0,<2.0.0 +torch>=1.11.0 torchaudio easyocr fasttext-wheel diff --git a/scripts/README.md b/scripts/README.md index 199753af1..5eb0a1f3f 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -11,4 +11,4 @@ The scripts to run are in ./dlc folder. We provide scripts to support running on slurm, see ./run_slurm.sh. -You can also manually partition the data according to specific circumstances and then use Slurm to run it on multiple machines by yourself. \ No newline at end of file +You can also manually partition the data according to specific circumstances and then use Slurm to run it on multiple machines by yourself. 
diff --git a/tests/ops/selector/test_random_selector.py b/tests/ops/selector/test_random_selector.py new file mode 100644 index 000000000..60f123be9 --- /dev/null +++ b/tests/ops/selector/test_random_selector.py @@ -0,0 +1,274 @@ +import unittest + +from datasets import Dataset + +from data_juicer.ops.selector.random_selector import RandomSelector +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class RandomSelectorTest(DataJuicerTestCaseBase): + + def _run_random_selector(self, dataset: Dataset, target_num, op): + dataset = op.process(dataset) + res_list = dataset.to_list() + self.assertEqual(len(res_list), target_num) + + def test_ratio_select(self): + ds_list = [{ + 'text': 'Today is Sun', + 'count': 101, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 34 + }, + 'count': 5 + } + } + }, { + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': 243 + }, + 'count': 63 + } + } + }, { + 'text': '中文也是一个字算一个长度', + 'count': 162, + 'meta': { + 'suffix': '.txt', + 'key1': { + 'key2': { + 'count': None + }, + 'count': 23 + } + } + }, { + 'text': ',。、„”“«»1」「《》´∶:?!', + 'count': None, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }, { + 'text': '他的英文名字叫Harry Potter', + 'count': 88, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 551 + }, + 'count': 78 + } + } + }, { + 'text': '这是一个测试', + 'count': None, + 'meta': { + 'suffix': '.py', + 'key1': { + 'key2': { + 'count': 89 + }, + 'count': 3 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }, { + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 33 + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }] + tgt_num = 2 + dataset = Dataset.from_list(ds_list) + op = RandomSelector(select_ratio=0.2, + select_num=5) + self._run_random_selector(dataset, tgt_num, op) + + def test_num_select(self): + ds_list = [{ + 'text': 'Today is Sun', + 'count': 101, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 34 + }, + 'count': 5 + } + } + }, { + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': 243 + }, + 'count': 63 + } + } + }, { + 'text': '中文也是一个字算一个长度', + 'count': 162, + 'meta': { + 'suffix': '.txt', + 'key1': { + 'key2': { + 'count': None + }, + 'count': 23 + } + } + }, { + 'text': ',。、„”“«»1」「《》´∶:?!', + 'count': None, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }, { + 'text': '他的英文名字叫Harry Potter', + 'count': 88, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 551 + }, + 'count': 78 + } + } + }, { + 'text': '这是一个测试', + 'count': None, + 'meta': { + 'suffix': '.py', + 'key1': { + 'key2': { + 'count': 89 + }, + 'count': 3 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': 
'.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }, { + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 33 + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }] + tgt_num = 4 + dataset = Dataset.from_list(ds_list) + op = RandomSelector(select_ratio=0.5, + select_num=4) + self._run_random_selector(dataset, tgt_num, op) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/selector/test_range_specified_field_selector.py b/tests/ops/selector/test_range_specified_field_selector.py new file mode 100644 index 000000000..b0dd77a1e --- /dev/null +++ b/tests/ops/selector/test_range_specified_field_selector.py @@ -0,0 +1,641 @@ +import unittest + +from datasets import Dataset + +from data_juicer.ops.selector.range_specified_field_selector import \ + RangeSpecifiedFieldSelector +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class RangeSpecifiedFieldSelectorTest(DataJuicerTestCaseBase): + + def _run_range_selector(self, dataset: Dataset, target_list, op): + dataset = op.process(dataset) + res_list = dataset.to_list() + res_list = sorted(res_list, key=lambda x: x['text']) + target_list = sorted(target_list, key=lambda x: x['text']) + self.assertEqual(res_list, target_list) + + def test_percentile_select(self): + ds_list = [{ + 'text': 'Today is Sun', + 'count': 101, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 34 + }, + 'count': 5 + } + } + }, { + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': 243 + }, + 'count': 63 + } + } + }, { + 'text': '中文也是一个字算一个长度', + 'count': 162, + 'meta': { + 'suffix': '.txt', + 'key1': { + 'key2': { + 'count': None + }, + 'count': 23 + } + } + }, { + 'text': ',。、„”“«»1」「《》´∶:?!', + 'count': None, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }, { + 'text': '他的英文名字叫Harry Potter', + 'count': 88, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 551 + }, + 'count': 78 + } + } + }, { + 'text': '这是一个测试', + 'count': None, + 'meta': { + 'suffix': '.py', + 'key1': { + 'key2': { + 'count': 89 + }, + 'count': 3 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }, { + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 33 + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }] + tgt_list = [{ + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': 243 + }, + 'count': 63 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }] + dataset = Dataset.from_list(ds_list) + op = RangeSpecifiedFieldSelector(field_key='meta.key1.count', + lower_percentile=0.78, + upper_percentile=0.9, + lower_rank=5, + upper_rank=10) + 
self._run_range_selector(dataset, tgt_list, op) + + def test_rank_select(self): + ds_list = [{ + 'text': 'Today is Sun', + 'count': 101, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 34 + }, + 'count': 5 + } + } + }, { + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': 243 + }, + 'count': 63 + } + } + }, { + 'text': '中文也是一个字算一个长度', + 'count': 162, + 'meta': { + 'suffix': '.txt', + 'key1': { + 'key2': { + 'count': None + }, + 'count': 23 + } + } + }, { + 'text': ',。、„”“«»1」「《》´∶:?!', + 'count': None, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }, { + 'text': '他的英文名字叫Harry Potter', + 'count': 88, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 551 + }, + 'count': 78 + } + } + }, { + 'text': '这是一个测试', + 'count': None, + 'meta': { + 'suffix': '.py', + 'key1': { + 'key2': { + 'count': 89 + }, + 'count': 3 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }, { + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 33 + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }] + tgt_list = [{ + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }] + dataset = Dataset.from_list(ds_list) + op = RangeSpecifiedFieldSelector(field_key='meta.key1.key2.count', + lower_percentile=0.3, + upper_percentile=1.0, + lower_rank=7, + upper_rank=9) + self._run_range_selector(dataset, tgt_list, op) + + def test_percentile_rank_select(self): + ds_list = [{ + 'text': 'Today is Sun', + 'count': 101, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 34 + }, + 'count': 5 + } + } + }, { + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': 243 + }, + 'count': 63 + } + } + }, { + 'text': '中文也是一个字算一个长度', + 'count': 162, + 'meta': { + 'suffix': '.txt', + 'key1': { + 'key2': { + 'count': None + }, + 'count': 23 + } + } + }, { + 'text': ',。、„”“«»1」「《》´∶:?!', + 'count': None, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 18 + }, + 'count': 48 + } + } + }, { + 'text': '他的英文名字叫Harry Potter', + 'count': 88, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': 551 + }, + 'count': 78 + } + } + }, { + 'text': '这是一个测试', + 'count': None, + 'meta': { + 'suffix': '.py', + 'key1': { + 'key2': { + 'count': 89 + }, + 'count': 3 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }, { + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 
'key2': { + 'count': 33 + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': 2 + }, + 'count': 48 + } + } + }] + tgt_list = [{ + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': 354.32 + }, + 'count': 32 + } + } + }] + dataset = Dataset.from_list(ds_list) + op = RangeSpecifiedFieldSelector(field_key='meta.key1.key2.count', + lower_percentile=0.7, + upper_percentile=1.0, + lower_rank=3, + upper_rank=9) + self._run_range_selector(dataset, tgt_list, op) + + def test_list_select(self): + ds_list = [{ + 'text': 'Today is Sun', + 'count': 101, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': [34.0] + }, + 'count': 5 + } + } + }, { + 'text': 'a v s e c s f e f g a a a ', + 'count': 16, + 'meta': { + 'suffix': '.docx', + 'key1': { + 'key2': { + 'count': [243.0] + }, + 'count': 63 + } + } + }, { + 'text': '中文也是一个字算一个长度', + 'count': 162, + 'meta': { + 'suffix': '.txt', + 'key1': { + 'key2': { + 'count': [] + }, + 'count': 23 + } + } + }, { + 'text': ',。、„”“«»1」「《》´∶:?!', + 'count': None, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': None + }, + 'count': 48 + } + } + }, { + 'text': '他的英文名字叫Harry Potter', + 'count': 88, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': [551.0] + }, + 'count': 78 + } + } + }, { + 'text': '这是一个测试', + 'count': None, + 'meta': { + 'suffix': '.py', + 'key1': { + 'key2': { + 'count': [89.0] + }, + 'count': 3 + } + } + }, { + 'text': '我出生于2023年12月15日', + 'count': None, + 'meta': { + 'suffix': '.java', + 'key1': { + 'key2': { + 'count': [354.32] + }, + 'count': 67 + } + } + }, { + 'text': 'emoji表情测试下😊,😸31231\n', + 'count': 2, + 'meta': { + 'suffix': '.html', + 'key1': { + 'key2': { + 'count': [354.32] + }, + 'count': 32 + } + } + }, { + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': [33.0, 33.0] + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': [2.0, 2.0] + }, + 'count': 48 + } + } + }] + tgt_list = [{ + 'text': 'a=1\nb\nc=1+2+3+5\nd=6', + 'count': 178, + 'meta': { + 'suffix': '.pdf', + 'key1': { + 'key2': { + 'count': [33.0, 33.0] + }, + 'count': 33 + } + } + }, { + 'text': '使用片段分词器对每个页面进行分词,使用语言', + 'count': 666, + 'meta': { + 'suffix': '.xml', + 'key1': { + 'key2': { + 'count': [2.0, 2.0] + }, + 'count': 48 + } + } + }] + dataset = Dataset.from_list(ds_list) + op = RangeSpecifiedFieldSelector(field_key='meta.key1.key2.count', + lower_percentile=0.0, + upper_percentile=0.5, + lower_rank=2, + upper_rank=4) + self._run_range_selector(dataset, tgt_list, op) + + +if __name__ == '__main__': + unittest.main() diff --git a/thirdparty/README.md b/thirdparty/LLM_ecosystems/README.md similarity index 97% rename from thirdparty/README.md rename to thirdparty/LLM_ecosystems/README.md index 4b8ed06c0..a86bce661 100644 --- a/thirdparty/README.md +++ b/thirdparty/LLM_ecosystems/README.md @@ -1,4 +1,4 @@ -# Third-parties (LLM Ecosystems) +# LLM Ecosystems Dependencies of Auto Evaluation Toolkit, see [`tools/evaluator/README.md`](../tools/evaluator/README.md) for more details. 
diff --git a/thirdparty/README_ZH.md b/thirdparty/LLM_ecosystems/README_ZH.md similarity index 96% rename from thirdparty/README_ZH.md rename to thirdparty/LLM_ecosystems/README_ZH.md index 456d4573b..fcdce2d23 100644 --- a/thirdparty/README_ZH.md +++ b/thirdparty/LLM_ecosystems/README_ZH.md @@ -1,4 +1,4 @@ -# 第三方库(大语言模型生态) +# 大语言模型生态 本目录包含了 Auto Evaluation Toolkit 的第三方依赖项,更多细节请参考 `tools/evaluator/README_ZH.md`。 diff --git a/thirdparty/patch/helm.diff b/thirdparty/LLM_ecosystems/patch/helm.diff similarity index 100% rename from thirdparty/patch/helm.diff rename to thirdparty/LLM_ecosystems/patch/helm.diff diff --git a/thirdparty/patch/megatron.diff b/thirdparty/LLM_ecosystems/patch/megatron.diff similarity index 100% rename from thirdparty/patch/megatron.diff rename to thirdparty/LLM_ecosystems/patch/megatron.diff diff --git a/thirdparty/setup_helm.sh b/thirdparty/LLM_ecosystems/setup_helm.sh similarity index 100% rename from thirdparty/setup_helm.sh rename to thirdparty/LLM_ecosystems/setup_helm.sh diff --git a/thirdparty/setup_megatron.sh b/thirdparty/LLM_ecosystems/setup_megatron.sh similarity index 100% rename from thirdparty/setup_megatron.sh rename to thirdparty/LLM_ecosystems/setup_megatron.sh diff --git a/thirdparty/models/README.md b/thirdparty/models/README.md new file mode 100644 index 000000000..40ba14d40 --- /dev/null +++ b/thirdparty/models/README.md @@ -0,0 +1,9 @@ +# Third-party Model Library + +## EasyAnimate + +Install [EasyAnimate](https://github.com/aigc-apps/EasyAnimate): + +```shell +bash ./setup_easyanimate.sh +``` diff --git a/thirdparty/models/README_ZH.md b/thirdparty/models/README_ZH.md new file mode 100644 index 000000000..19c333145 --- /dev/null +++ b/thirdparty/models/README_ZH.md @@ -0,0 +1,9 @@ +# 第三方模型库 + +## EasyAnimate + +安装[EasyAnimate](https://github.com/aigc-apps/EasyAnimate): + +```shell +bash ./setup_easyanimate.sh +``` diff --git a/thirdparty/models/patch/easyanimate.diff b/thirdparty/models/patch/easyanimate.diff new file mode 100644 index 000000000..ad635440d --- /dev/null +++ b/thirdparty/models/patch/easyanimate.diff @@ -0,0 +1,1852 @@ +diff --git a/easyanimate/data/dataset_video.py b/easyanimate/data/dataset_video.py +index c78367d..70adb0c 100644 +--- a/easyanimate/data/dataset_video.py ++++ b/easyanimate/data/dataset_video.py +@@ -162,7 +162,13 @@ class VideoDataset(Dataset): + enable_bucket=False, enable_inpaint=False + ): + print(f"loading annotations from {json_path} ...") +- self.dataset = json.load(open(json_path, 'r')) ++ ++ if json_path.lower().endswith('jsonl'): ++ with open(json_path, 'r') as fin: ++ self.dataset = [json.loads(line.strip()) for line in fin] ++ else: ++ self.dataset = json.load(open(json_path, 'r')) ++ + self.length = len(self.dataset) + print(f"data scale: {self.length}") + +@@ -183,38 +189,54 @@ class VideoDataset(Dataset): + + def get_batch(self, idx): + video_dict = self.dataset[idx] +- video_id, name = video_dict['file_path'], video_dict['text'] ++ video_id, name = video_dict['videos'], video_dict['text'] + +- if self.video_folder is None: +- video_dir = video_id +- else: +- video_dir = os.path.join(self.video_folder, video_id) ++ if isinstance(video_id, list): ++ video_id = video_id[0] + +- with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader: +- video_length = len(video_reader) +- +- clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1) +- start_idx = random.randint(0, video_length - clip_length) +- batch_index = np.linspace(start_idx, start_idx + clip_length 
- 1, self.sample_n_frames, dtype=int) +- +- try: +- sample_args = (video_reader, batch_index) +- pixel_values = func_timeout( +- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args ++ video_dir = video_id ++ if not os.path.exists(video_id): ++ if self.video_folder: ++ video_dir = os.path.join(self.video_folder, video_id) ++ else: ++ raise ValueError( ++ f"{video_id} does not exist, please change it to absolute path or ser video_folder" + ) +- except FunctionTimedOut: +- raise ValueError(f"Read {idx} timeout.") +- except Exception as e: +- raise ValueError(f"Failed to extract frames from video. Error is {e}.") + +- if not self.enable_bucket: +- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() +- pixel_values = pixel_values / 255. +- del video_reader +- else: +- pixel_values = pixel_values ++ video_reader = VideoReader(video_dir) ++ video_length = len(video_reader) ++ ++ clip_length = min(video_length, ++ (self.sample_n_frames - 1) * self.sample_stride + 1) ++ start_idx = random.randint(0, video_length - clip_length) ++ batch_index = np.linspace(start_idx, ++ start_idx + clip_length - 1, ++ self.sample_n_frames, ++ dtype=int) ++ ++ try: ++ sample_args = (video_reader, batch_index) ++ pixel_values = func_timeout( ++ VIDEO_READER_TIMEOUT, ++ get_video_reader_batch, ++ args=sample_args ++ ) ++ except FunctionTimedOut: ++ raise ValueError(f"Read {idx} timeout.") ++ except Exception as e: ++ raise ValueError(f"Failed to extract frames from video. Error is {e}.") + +- return pixel_values, name ++ if not self.enable_bucket: ++ pixel_values = torch.from_numpy(pixel_values).permute( ++ 0, 3, 1, 2).contiguous() ++ pixel_values = pixel_values / 255. ++ del video_reader ++ ++ # remove special token ++ name = name.replace('<__dj__video>', '').replace('<|__dj__eoc|>', ++ '').strip() ++ ++ return pixel_values, name + + def __len__(self): + return self.length +@@ -259,4 +281,4 @@ if __name__ == "__main__": + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,) + for idx, batch in enumerate(dataloader): +- print(batch["pixel_values"].shape, len(batch["text"])) +\ No newline at end of file ++ print(batch["pixel_values"].shape, len(batch["text"])) +diff --git a/easyanimate/models/attention.py b/easyanimate/models/attention.py +index 00f1b83..bf05590 100644 +--- a/easyanimate/models/attention.py ++++ b/easyanimate/models/attention.py +@@ -28,10 +28,12 @@ if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version + else: + from diffusers.models.attention_processor import Attention, AttnProcessor2_0 + ++from diffusers.models.activations import GEGLU, GELU, ApproximateGELU + from diffusers.models.attention import AdaLayerNorm, FeedForward + from diffusers.models.attention_processor import (Attention, AttnProcessor2_0, + HunyuanAttnProcessor2_0) + from diffusers.models.embeddings import SinusoidalPositionalEmbedding ++from diffusers.models.lora import LoRACompatibleLinear + from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero + from diffusers.utils import USE_PEFT_BACKEND + from diffusers.utils.import_utils import is_xformers_available +@@ -1844,3 +1846,60 @@ class HunyuanTemporalTransformerBlock(nn.Module): + hidden_states = hidden_states + self.ff(mlp_inputs) + + return hidden_states ++ ++ ++class FeedForward(nn.Module): ++ r""" ++ A feed-forward layer. ++ ++ Parameters: ++ dim (`int`): The number of channels in the input. ++ dim_out (`int`, *optional*): The number of channels in the output. 
If not given, defaults to `dim`. ++ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. ++ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. ++ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. ++ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. ++ """ ++ ++ def __init__( ++ self, ++ dim: int, ++ dim_out: Optional[int] = None, ++ mult: int = 4, ++ dropout: float = 0.0, ++ activation_fn: str = "geglu", ++ final_dropout: bool = False, ++ ): ++ super().__init__() ++ inner_dim = int(dim * mult) ++ dim_out = dim_out if dim_out is not None else dim ++ linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear ++ ++ if activation_fn == "gelu": ++ act_fn = GELU(dim, inner_dim) ++ if activation_fn == "gelu-approximate": ++ act_fn = GELU(dim, inner_dim, approximate="tanh") ++ elif activation_fn == "geglu": ++ act_fn = GEGLU(dim, inner_dim) ++ elif activation_fn == "geglu-approximate": ++ act_fn = ApproximateGELU(dim, inner_dim) ++ ++ self.net = nn.ModuleList([]) ++ # project in ++ self.net.append(act_fn) ++ # project dropout ++ self.net.append(nn.Dropout(dropout)) ++ # project out ++ self.net.append(linear_cls(inner_dim, dim_out)) ++ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout ++ if final_dropout: ++ self.net.append(nn.Dropout(dropout)) ++ ++ def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor: ++ compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear) ++ for module in self.net: ++ if isinstance(module, compatible_cls): ++ hidden_states = module(hidden_states, scale) ++ else: ++ hidden_states = module(hidden_states) ++ return hidden_states +diff --git a/easyanimate/models/transformer3d.py b/easyanimate/models/transformer3d.py +index 53d09ea..07a2b63 100644 +--- a/easyanimate/models/transformer3d.py ++++ b/easyanimate/models/transformer3d.py +@@ -501,6 +501,7 @@ class Transformer3DModel(ModelMixin, ConfigMixin): + If `return_dict` is True, an [`~models.transformer_2d.Transformer3DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ ++ hidden_states = hidden_states.to(self.pos_embed.proj.weight.dtype) + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. + # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. + # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. 
+@@ -716,8 +717,7 @@ class Transformer3DModel(ModelMixin, ConfigMixin): + if model.state_dict()['pos_embed.proj.weight'].size() != state_dict['pos_embed.proj.weight'].size(): + new_shape = model.state_dict()['pos_embed.proj.weight'].size() + if len(new_shape) == 5: +- state_dict['pos_embed.proj.weight'] = state_dict['pos_embed.proj.weight'].unsqueeze(2).expand(new_shape).clone() +- state_dict['pos_embed.proj.weight'][:, :, :-1] = 0 ++ state_dict['pos_embed.proj.weight'] = state_dict['pos_embed.proj.weight'].unsqueeze(2).expand(new_shape) / patch_size + else: + model.state_dict()['pos_embed.proj.weight'][:, :4, :, :] = state_dict['pos_embed.proj.weight'] + model.state_dict()['pos_embed.proj.weight'][:, 4:, :, :] = 0 +diff --git a/easyanimate/utils/IDDIM.py b/easyanimate/utils/IDDIM.py +new file mode 100644 +index 0000000..3b07eb2 +--- /dev/null ++++ b/easyanimate/utils/IDDIM.py +@@ -0,0 +1,44 @@ ++# Modified from OpenAI's diffusion repos ++# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py ++# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion ++# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py ++from . import gaussian_diffusion as gd ++from .respace import SpacedDiffusion, space_timesteps ++ ++ ++def IDDPM( ++ timestep_respacing, ++ noise_schedule="linear", ++ use_kl=False, ++ sigma_small=False, ++ predict_xstart=False, ++ learn_sigma=True, ++ pred_sigma=True, ++ rescale_learned_sigmas=False, ++ diffusion_steps=1000, ++ snr=False, ++ return_startx=False, ++): ++ betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps) ++ if use_kl: ++ loss_type = gd.LossType.RESCALED_KL ++ elif rescale_learned_sigmas: ++ loss_type = gd.LossType.RESCALED_MSE ++ else: ++ loss_type = gd.LossType.MSE ++ if timestep_respacing is None or timestep_respacing == "": ++ timestep_respacing = [diffusion_steps] ++ return SpacedDiffusion( ++ use_timesteps=space_timesteps(diffusion_steps, timestep_respacing), ++ betas=betas, ++ model_mean_type=(gd.ModelMeanType.START_X ++ if predict_xstart else gd.ModelMeanType.EPSILON), ++ model_var_type=((gd.ModelVarType.LEARNED_RANGE if learn_sigma else ++ (gd.ModelVarType.FIXED_LARGE ++ if not sigma_small else gd.ModelVarType.FIXED_SMALL)) ++ if pred_sigma else None), ++ loss_type=loss_type, ++ snr=snr, ++ return_startx=return_startx, ++ # rescale_timesteps=rescale_timesteps, ++ ) +diff --git a/infer_lora.py b/infer_lora.py +new file mode 100644 +index 0000000..0d3ec11 +--- /dev/null ++++ b/infer_lora.py +@@ -0,0 +1,269 @@ ++import argparse ++import json ++import os ++import sys ++ ++import torch ++from diffusers import (AutoencoderKL, DDIMScheduler, ++ DPMSolverMultistepScheduler, ++ EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, ++ PNDMScheduler) ++from omegaconf import OmegaConf ++from tqdm import tqdm ++ ++current_file_path = os.path.abspath(__file__) ++project_roots = [ ++ os.path.dirname(current_file_path), ++ os.path.dirname(os.path.dirname(current_file_path)) ++] ++for project_root in project_roots: ++ sys.path.insert(0, project_root) if project_root not in sys.path else None ++ ++from easyanimate.models.autoencoder_magvit import AutoencoderKLMagvit ++from easyanimate.models.transformer3d import Transformer3DModel ++from easyanimate.pipeline.pipeline_easyanimate import EasyAnimatePipeline ++from easyanimate.utils.lora_utils import merge_lora, unmerge_lora ++from easyanimate.utils.utils import save_videos_grid ++ ++ ++def parse_args(): 
++ parser = argparse.ArgumentParser( ++ description="Simple example of a training script.") ++ ++ parser.add_argument( ++ "--prompt_info_path", ++ type=str, ++ default=None, ++ help=("The prompts to produce videos."), ++ ) ++ parser.add_argument( ++ "--image_size", ++ type=int, ++ default=256, ++ help=("The size of generated image. It can be 256 or 512."), ++ ) ++ parser.add_argument( ++ "--chunks_num", ++ type=int, ++ default=1, ++ help=("The number of prompts divided for different devices."), ++ ) ++ parser.add_argument( ++ "--chunk_id", ++ type=int, ++ default=0, ++ help=("The chunk_id in current device."), ++ ) ++ parser.add_argument( ++ "--batch_size", ++ type=int, ++ default=8, ++ help=("The batch size in each inferance."), ++ ) ++ parser.add_argument( ++ "--video_num_per_prompt", ++ type=int, ++ default=3, ++ help=("The number of generated videos for each prompt."), ++ ) ++ parser.add_argument("--seed", ++ type=int, ++ default=None, ++ help="A seed for reproducible training.") ++ parser.add_argument( ++ "--config_path", ++ type=str, ++ default=None, ++ help=("The config of the model in inferance."), ++ ) ++ parser.add_argument( ++ "--pretrained_model_name_or_path", ++ type=str, ++ default=None, ++ required=True, ++ help= ++ "Path to pretrained model or model identifier from huggingface.co/models.", ++ ) ++ parser.add_argument( ++ "--sampler_name", ++ type=str, ++ default="DPM++", ++ choices=['Euler', 'Euler A', 'DPM++', 'PNDM', 'DDIM'], ++ help= ++ "Choose the sampler in 'Euler' 'Euler A' 'DPM++' 'PNDM' and 'DDIM'", ++ ) ++ parser.add_argument( ++ "--mixed_precision", ++ type=str, ++ default=None, ++ choices=["no", "fp16", "bf16"], ++ help= ++ ("Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" ++ " 1.10.and an Nvidia Ampere GPU."), ++ ) ++ parser.add_argument( ++ "--transformer_path", ++ type=str, ++ default=None, ++ help= ++ ("If you want to load the weight from other transformers, input its path." 
++ ), ++ ) ++ parser.add_argument( ++ "--vae_path", ++ type=str, ++ default=None, ++ help=( ++ "If you want to load the weight from other vaes, input its path."), ++ ) ++ parser.add_argument( ++ "--lora_path", ++ type=str, ++ default=None, ++ help=("The path to the trained lora weight."), ++ ) ++ parser.add_argument( ++ "--save_path", ++ type=str, ++ default=None, ++ help=("The save path of generated videos."), ++ ) ++ ++ args = parser.parse_args() ++ ++ return args ++ ++ ++def get_chunk(prompt_dicts, chunk_id, chunks_num): ++ l = len(prompt_dicts) ++ chunk_len = ((l - 1) // chunks_num) + 1 ++ f = chunk_id * chunk_len ++ t = (chunk_id + 1) * chunk_len ++ return prompt_dicts[f:t] ++ ++ ++def get_batch(arr, batch_size): ++ l = len(arr) ++ batch_num = ((l - 1) // batch_size) + 1 ++ batch_arr = [] ++ for i in range(batch_num): ++ batch_arr.append(arr[i * batch_size:(i + 1) * batch_size]) ++ return batch_arr ++ ++ ++def main(): ++ args = parse_args() ++ ++ video_length = 16 ++ fps = 12 ++ guidance_scale = 6.0 ++ num_inference_steps = 30 ++ lora_weight = 0.55 ++ negative_prompt = "worst quality, normal quality, low quality, low res, blurry, text, watermark, logo, banner, extra digits, cropped, jpeg artifacts, signature, username, error, sketch ,duplicate, ugly, monochrome, horror, geometry" ++ ++ sample_size = [args.image_size, args.image_size] ++ ++ weight_dtype = torch.float32 ++ if args.mixed_precision == "fp16": ++ weight_dtype = torch.float16 ++ elif args.mixed_precision == "bf16": ++ weight_dtype = torch.bfloat16 ++ config = OmegaConf.load(args.config_path) ++ ++ # Get Transformer3d ++ transformer3d = Transformer3DModel.from_pretrained_2d( ++ args.pretrained_model_name_or_path, ++ subfolder="transformer", ++ transformer_additional_kwargs=OmegaConf.to_container( ++ config['transformer_additional_kwargs'])).to(weight_dtype) ++ ++ if args.transformer_path is not None: ++ print(f"From checkpoint: {args.transformer_path}") ++ if args.transformer_path.endswith("safetensors"): ++ from safetensors.torch import load_file, safe_open ++ state_dict = load_file(args.transformer_path) ++ else: ++ state_dict = torch.load(args.transformer_path, map_location="cpu") ++ state_dict = state_dict[ ++ "state_dict"] if "state_dict" in state_dict else state_dict ++ ++ m, u = transformer3d.load_state_dict(state_dict, strict=False) ++ print(f"missing keys: {len(m)}, unexpected keys: {len(u)}") ++ ++ # Get Vae ++ if OmegaConf.to_container(config['vae_kwargs'])['enable_magvit']: ++ Choosen_AutoencoderKL = AutoencoderKLMagvit ++ else: ++ Choosen_AutoencoderKL = AutoencoderKL ++ vae = Choosen_AutoencoderKL.from_pretrained( ++ args.pretrained_model_name_or_path, ++ subfolder="vae", ++ torch_dtype=weight_dtype) ++ ++ if args.vae_path is not None: ++ print(f"From checkpoint: {args.vae_path}") ++ if args.vae_path.endswith("safetensors"): ++ from safetensors.torch import load_file, safe_open ++ state_dict = load_file(args.vae_path) ++ else: ++ state_dict = torch.load(args.vae_path, map_location="cpu") ++ state_dict = state_dict[ ++ "state_dict"] if "state_dict" in state_dict else state_dict ++ ++ m, u = vae.load_state_dict(state_dict, strict=False) ++ print(f"missing keys: {len(m)}, unexpected keys: {len(u)}") ++ ++ # Get Scheduler ++ Choosen_Scheduler = scheduler_dict = { ++ "Euler": EulerDiscreteScheduler, ++ "Euler A": EulerAncestralDiscreteScheduler, ++ "DPM++": DPMSolverMultistepScheduler, ++ "PNDM": PNDMScheduler, ++ "DDIM": DDIMScheduler, ++ }[args.sampler_name] ++ scheduler = Choosen_Scheduler( ++ 
**OmegaConf.to_container(config['noise_scheduler_kwargs'])) ++ ++ pipeline = EasyAnimatePipeline.from_pretrained( ++ args.pretrained_model_name_or_path, ++ vae=vae, ++ transformer=transformer3d, ++ scheduler=scheduler, ++ torch_dtype=weight_dtype) ++ pipeline.to("cuda") ++ pipeline.enable_model_cpu_offload() ++ ++ generator = torch.Generator(device="cuda").manual_seed(args.seed) ++ ++ if args.lora_path is not None: ++ pipeline = merge_lora(pipeline, args.lora_path, lora_weight) ++ ++ with open(args.prompt_info_path) as f: ++ prompt_dicts = get_chunk(json.load(f), args.chunk_id, args.chunks_num) ++ prompts = [d["prompt_en"] for d in prompt_dicts] ++ prompt_batches = get_batch(prompts, args.batch_size) ++ ++ if not os.path.exists(args.save_path): ++ os.makedirs(args.save_path, exist_ok=True) ++ ++ with torch.no_grad(): ++ for prompts in tqdm(prompt_batches): ++ for i in range(args.video_num_per_prompt): ++ samples = pipeline( ++ prompts, ++ video_length=video_length, ++ negative_prompt=negative_prompt, ++ height=sample_size[0], ++ width=sample_size[1], ++ generator=generator, ++ guidance_scale=guidance_scale, ++ num_inference_steps=num_inference_steps, ++ ).videos ++ for prompt, sample in zip(prompts, samples): ++ video_path = os.path.join(args.save_path, ++ f"{prompt}-{str(i)}.mp4") ++ save_videos_grid(sample.unsqueeze(0), video_path, fps=fps) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/infer_lora.sh b/infer_lora.sh +new file mode 100644 +index 0000000..25b3377 +--- /dev/null ++++ b/infer_lora.sh +@@ -0,0 +1,47 @@ ++#!/bin/bash ++ ++########################################################## ++ ++# model path ++PRETRAINED_MODEL_NAME_OR_PATH=$1 ++TRANSFORMER_PATH=$2 ++LORA_PATH=$3 ++ ++# inferance config ++IMAGE_SIZE=$4 ++PROMPT_INFO_PATH=$5 ++GPU_NUM=$6 ++BATCH_SIZE=$7 ++MIXED_PRECISION=$8 ++VIDEO_NUM_PER_PROMPT=$9 ++SEED=${10} ++ ++# saving config ++OUTPUT_VIDEO_DIR=${11} ++ ++########################################################## ++ ++ ++ ++# run ++for (( i = 0; i < GPU_NUM; i++ )); do ++{ ++ CUDA_VISIBLE_DEVICES=$i python infer_lora.py \ ++ --prompt_info_path=$PROMPT_INFO_PATH \ ++ --config_path "config/easyanimate_video_motion_module_v1.yaml" \ ++ --pretrained_model_name_or_path=$PRETRAINED_MODEL_NAME_OR_PATH \ ++ --transformer_path=$TRANSFORMER_PATH \ ++ --lora_path=$LORA_PATH \ ++ --image_size=$IMAGE_SIZE \ ++ --chunks_num=$GPU_NUM \ ++ --chunk_id=$i \ ++ --batch_size=$BATCH_SIZE \ ++ --video_num_per_prompt=$VIDEO_NUM_PER_PROMPT \ ++ --mixed_precision=$MIXED_PRECISION \ ++ --save_path=$OUTPUT_VIDEO_DIR \ ++ --seed=$SEED ++} & ++done ++ ++wait ++ +diff --git a/scripts/train_lora.py b/scripts/train_lora.py +index 5999f95..7cb46a4 100644 +--- a/scripts/train_lora.py ++++ b/scripts/train_lora.py +@@ -1480,4 +1480,4 @@ def main(): + + + if __name__ == "__main__": +- main() ++ main() +\ No newline at end of file +diff --git a/train_lora.py b/train_lora.py +new file mode 100644 +index 0000000..55fbb68 +--- /dev/null ++++ b/train_lora.py +@@ -0,0 +1,1186 @@ ++"""Modified from https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py ++""" ++#!/usr/bin/env python ++# coding=utf-8 ++# Copyright 2024 The HuggingFace Inc. team. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. 
++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++ ++import argparse ++import gc ++import logging ++import math ++import os ++import re ++import shutil ++import sys ++ ++import accelerate ++# import easyanimate pakage ++import datasets ++import diffusers ++import numpy as np ++import torch ++import torch.nn.functional as F ++import torch.utils.checkpoint ++import transformers ++from accelerate import Accelerator, DeepSpeedPlugin ++from accelerate.logging import get_logger ++from accelerate.state import AcceleratorState ++from accelerate.utils import ProjectConfiguration, set_seed ++from diffusers import AutoencoderKL, DDPMScheduler ++from diffusers.optimization import get_scheduler ++from diffusers.training_utils import EMAModel ++from diffusers.utils import check_min_version, deprecate, is_wandb_available ++from diffusers.utils.import_utils import is_xformers_available ++from diffusers.utils.torch_utils import is_compiled_module ++from einops import rearrange ++from omegaconf import OmegaConf ++from packaging import version ++from PIL import Image ++from torch.utils.data import RandomSampler ++from torch.utils.tensorboard import SummaryWriter ++from torchvision import transforms ++from tqdm.auto import tqdm ++from transformers import T5EncoderModel, T5Tokenizer ++from transformers.utils import ContextManagers ++ ++current_file_path = os.path.abspath(__file__) ++project_roots = [ ++ os.path.dirname(current_file_path), ++ os.path.dirname(os.path.dirname(current_file_path)) ++] ++for project_root in project_roots: ++ sys.path.insert(0, project_root) if project_root not in sys.path else None ++ ++from easyanimate.data.bucket_sampler import (ASPECT_RATIO_512, ++ ASPECT_RATIO_RANDOM_CROP_512, ++ ASPECT_RATIO_RANDOM_CROP_PROB, ++ AspectRatioBatchSampler, ++ get_closest_ratio) ++from easyanimate.data.dataset_video import VideoDataset ++from easyanimate.models.autoencoder_magvit import AutoencoderKLMagvit ++from easyanimate.models.transformer3d import Transformer3DModel ++from easyanimate.pipeline.pipeline_easyanimate import EasyAnimatePipeline ++from easyanimate.utils.IDDIM import IDDPM ++from easyanimate.utils.lora_utils import (create_network, merge_lora, ++ unmerge_lora) ++from easyanimate.utils.utils import save_videos_grid ++ ++if is_wandb_available(): ++ import wandb ++ ++# Will error if the minimal version of diffusers is not installed. Remove at your own risks. ++check_min_version("0.18.0.dev0") ++ ++logger = get_logger(__name__, log_level="INFO") ++ ++ ++def auto_scale_lr(effective_bs, lr, rule='linear', base_batch_size=256): ++ assert rule in ['linear', 'sqrt'] ++ # scale by world size ++ if rule == 'sqrt': ++ scale_ratio = math.sqrt(effective_bs / base_batch_size) ++ elif rule == 'linear': ++ scale_ratio = effective_bs / base_batch_size ++ lr *= scale_ratio ++ logger.info( ++ f'Automatically adapt lr to {lr:.7f} (using {rule} scaling rule).') ++ return lr ++ ++ ++def log_validation(vae, text_encoder, tokenizer, transformer3d, network, ++ config, args, accelerator, weight_dtype, global_step): ++ # try: ++ logger.info("Running validation... 
") ++ ++ transformer3d_val = Transformer3DModel.from_pretrained_2d( ++ args.pretrained_model_name_or_path, ++ subfolder="transformer", ++ transformer_additional_kwargs=OmegaConf.to_container( ++ config['transformer_additional_kwargs'])).to(weight_dtype) ++ transformer3d_val.load_state_dict( ++ accelerator.unwrap_model(transformer3d).state_dict()) ++ ++ pipeline = EasyAnimatePipeline.from_pretrained( ++ args.pretrained_model_name_or_path, ++ vae=accelerator.unwrap_model(vae).to(weight_dtype), ++ text_encoder=accelerator.unwrap_model(text_encoder), ++ tokenizer=tokenizer, ++ transformer=transformer3d_val, ++ torch_dtype=weight_dtype) ++ pipeline = pipeline.to(accelerator.device) ++ pipeline = merge_lora( ++ pipeline, ++ None, ++ 1, ++ accelerator.device, ++ state_dict=accelerator.unwrap_model(network).state_dict(), ++ transformer_only=True) ++ ++ if args.enable_xformers_memory_efficient_attention: ++ pipeline.enable_xformers_memory_efficient_attention() ++ ++ if args.seed is None: ++ generator = None ++ else: ++ generator = torch.Generator(device=accelerator.device).manual_seed( ++ args.seed) ++ ++ images = [] ++ for i in range(len(args.validation_prompts)): ++ with torch.no_grad(): ++ sample = pipeline(args.validation_prompts[i], ++ video_length=args.sample_n_frames, ++ negative_prompt="bad detailed", ++ height=args.sample_size, ++ width=args.sample_size, ++ generator=generator).videos ++ os.makedirs(os.path.join(args.output_dir, "sample"), exist_ok=True) ++ save_videos_grid( ++ sample, ++ os.path.join(args.output_dir, ++ f"sample/sample-{global_step}-{i}.gif")) ++ ++ del pipeline ++ del transformer3d_val ++ gc.collect() ++ torch.cuda.empty_cache() ++ torch.cuda.ipc_collect() ++ ++ return images ++ ++ ++def parse_args(): ++ parser = argparse.ArgumentParser( ++ description="Simple example of a training script.") ++ parser.add_argument( ++ "--input_perturbation", ++ type=float, ++ default=0, ++ help="The scale of input perturbation. Recommended 0.1.") ++ parser.add_argument( ++ "--pretrained_model_name_or_path", ++ type=str, ++ default=None, ++ required=True, ++ help= ++ "Path to pretrained model or model identifier from huggingface.co/models.", ++ ) ++ parser.add_argument( ++ "--revision", ++ type=str, ++ default=None, ++ required=False, ++ help= ++ "Revision of pretrained model identifier from huggingface.co/models.", ++ ) ++ parser.add_argument( ++ "--variant", ++ type=str, ++ default=None, ++ help= ++ "Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", ++ ) ++ parser.add_argument( ++ "--train_data_dir", ++ type=str, ++ default=None, ++ help=("A folder containing the training data. "), ++ ) ++ parser.add_argument( ++ "--train_data_meta", ++ type=str, ++ default=None, ++ help=("A csv containing the training data. "), ++ ) ++ parser.add_argument( ++ "--max_train_samples", ++ type=int, ++ default=None, ++ help= ++ ("For debugging purposes or quicker training, truncate the number of training examples to this " ++ "value if set."), ++ ) ++ parser.add_argument( ++ "--validation_prompts", ++ type=str, ++ default=None, ++ nargs="+", ++ help= ++ ("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`." ++ ), ++ ) ++ parser.add_argument( ++ "--validation_images", ++ type=str, ++ default=None, ++ nargs="+", ++ help= ++ ("A set of images evaluated every `--validation_epochs` and logged to `--report_to`." 
++ ), ++ ) ++ parser.add_argument( ++ "--output_dir", ++ type=str, ++ default="sd-model-finetuned", ++ help= ++ "The output directory where the model predictions and checkpoints will be written.", ++ ) ++ parser.add_argument( ++ "--cache_dir", ++ type=str, ++ default=None, ++ help= ++ "The directory where the downloaded models and datasets will be stored.", ++ ) ++ parser.add_argument("--seed", ++ type=int, ++ default=None, ++ help="A seed for reproducible training.") ++ parser.add_argument( ++ "--random_flip", ++ action="store_true", ++ help="whether to randomly flip images horizontally", ++ ) ++ parser.add_argument( ++ "--train_batch_size", ++ type=int, ++ default=16, ++ help="Batch size (per device) for the training dataloader.") ++ parser.add_argument("--vae_mini_batch", ++ type=int, ++ default=32, ++ help="mini batch size for vae.") ++ parser.add_argument("--num_train_epochs", type=int, default=100) ++ parser.add_argument( ++ "--max_train_steps", ++ type=int, ++ default=None, ++ help= ++ "Total number of training steps to perform. If provided, overrides num_train_epochs.", ++ ) ++ parser.add_argument( ++ "--gradient_accumulation_steps", ++ type=int, ++ default=1, ++ help= ++ "Number of updates steps to accumulate before performing a backward/update pass.", ++ ) ++ parser.add_argument( ++ "--gradient_checkpointing", ++ action="store_true", ++ help= ++ "Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", ++ ) ++ parser.add_argument( ++ "--learning_rate", ++ type=float, ++ default=1e-4, ++ help= ++ "Initial learning rate (after the potential warmup period) to use.", ++ ) ++ parser.add_argument( ++ "--scale_lr", ++ action="store_true", ++ default=False, ++ help= ++ "Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", ++ ) ++ parser.add_argument( ++ "--lr_scheduler", ++ type=str, ++ default="constant", ++ help= ++ ('The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' ++ ' "constant", "constant_with_warmup"]'), ++ ) ++ parser.add_argument( ++ "--lr_warmup_steps", ++ type=int, ++ default=500, ++ help="Number of steps for the warmup in the lr scheduler.") ++ parser.add_argument( ++ "--use_8bit_adam", ++ action="store_true", ++ help="Whether or not to use 8-bit Adam from bitsandbytes.") ++ parser.add_argument( ++ "--allow_tf32", ++ action="store_true", ++ help= ++ ("Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" ++ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" ++ ), ++ ) ++ parser.add_argument( ++ "--non_ema_revision", ++ type=str, ++ default=None, ++ required=False, ++ help= ++ ("Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or" ++ " remote repository specified with --pretrained_model_name_or_path."), ++ ) ++ parser.add_argument( ++ "--dataloader_num_workers", ++ type=int, ++ default=0, ++ help= ++ ("Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
++ ), ++ ) ++ parser.add_argument("--adam_beta1", ++ type=float, ++ default=0.9, ++ help="The beta1 parameter for the Adam optimizer.") ++ parser.add_argument("--adam_beta2", ++ type=float, ++ default=0.999, ++ help="The beta2 parameter for the Adam optimizer.") ++ parser.add_argument("--adam_weight_decay", ++ type=float, ++ default=1e-2, ++ help="Weight decay to use.") ++ parser.add_argument("--adam_epsilon", ++ type=float, ++ default=1e-08, ++ help="Epsilon value for the Adam optimizer") ++ parser.add_argument("--max_grad_norm", ++ default=1.0, ++ type=float, ++ help="Max gradient norm.") ++ parser.add_argument("--push_to_hub", ++ action="store_true", ++ help="Whether or not to push the model to the Hub.") ++ parser.add_argument("--hub_token", ++ type=str, ++ default=None, ++ help="The token to use to push to the Model Hub.") ++ parser.add_argument( ++ "--prediction_type", ++ type=str, ++ default=None, ++ help= ++ "The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", ++ ) ++ parser.add_argument( ++ "--hub_model_id", ++ type=str, ++ default=None, ++ help= ++ "The name of the repository to keep in sync with the local `output_dir`.", ++ ) ++ parser.add_argument( ++ "--logging_dir", ++ type=str, ++ default="logs", ++ help= ++ ("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" ++ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."), ++ ) ++ parser.add_argument( ++ "--mixed_precision", ++ type=str, ++ default=None, ++ choices=["no", "fp16", "bf16"], ++ help= ++ ("Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" ++ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" ++ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." ++ ), ++ ) ++ parser.add_argument( ++ "--report_to", ++ type=str, ++ default="tensorboard", ++ help= ++ ('The integration to report the results and logs to. Supported platforms are `"tensorboard"`' ++ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' ++ ), ++ ) ++ parser.add_argument("--local_rank", ++ type=int, ++ default=-1, ++ help="For distributed training: local_rank") ++ parser.add_argument( ++ "--checkpointing_steps", ++ type=int, ++ default=500, ++ help= ++ ("Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" ++ " training using `--resume_from_checkpoint`."), ++ ) ++ parser.add_argument( ++ "--checkpoints_total_limit", ++ type=int, ++ default=None, ++ help=("Max number of checkpoints to store."), ++ ) ++ parser.add_argument( ++ "--resume_from_checkpoint", ++ type=str, ++ default=None, ++ help= ++ ("Whether training should be resumed from a previous checkpoint. Use a path saved by" ++ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
++ ), ++ ) ++ parser.add_argument("--enable_xformers_memory_efficient_attention", ++ action="store_true", ++ help="Whether or not to use xformers.") ++ parser.add_argument( ++ "--validation_epochs", ++ type=int, ++ default=5, ++ help="Run validation every X epochs.", ++ ) ++ parser.add_argument( ++ "--validation_steps", ++ type=int, ++ default=2000, ++ help="Run validation every X steps.", ++ ) ++ parser.add_argument( ++ "--tracker_project_name", ++ type=str, ++ default="text2image-fine-tune", ++ help= ++ ("The `project_name` argument passed to Accelerator.init_trackers for" ++ " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" ++ ), ++ ) ++ parser.add_argument( ++ "--tracker_experiment_name", ++ type=str, ++ default="experiment1", ++ help= ++ ("The name for wandb init"), ++ ) ++ ++ parser.add_argument( ++ "--rank", ++ type=int, ++ default=128, ++ help=("The dimension of the LoRA update matrices."), ++ ) ++ parser.add_argument( ++ "--network_alpha", ++ type=int, ++ default=64, ++ help=("The dimension of the LoRA update matrices."), ++ ) ++ parser.add_argument( ++ "--train_text_encoder", ++ action="store_true", ++ help= ++ "Whether to train the text encoder. If set, the text encoder should be float32 precision.", ++ ) ++ parser.add_argument("--snr_loss", ++ action="store_true", ++ help="Whether or not to use snr_loss.") ++ parser.add_argument( ++ "--random_ratio_crop", ++ action="store_true", ++ help="Whether enable random ratio crop sample in datasets.") ++ parser.add_argument( ++ "--random_frame_crop", ++ action="store_true", ++ help="Whether enable random frame crop sample in datasets.") ++ parser.add_argument( ++ "--train_sampling_steps", ++ type=int, ++ default=1000, ++ help="Run train_sampling_steps.", ++ ) ++ parser.add_argument( ++ "--sample_size", ++ type=int, ++ default=256, ++ help="Sample size of the video.", ++ ) ++ parser.add_argument( ++ "--sample_stride", ++ type=int, ++ default=4, ++ help="Sample stride of the video.", ++ ) ++ parser.add_argument( ++ "--sample_n_frames", ++ type=int, ++ default=4, ++ help="Num frame of video.", ++ ) ++ parser.add_argument( ++ "--config_path", ++ type=str, ++ default=None, ++ help=("The config of the model in training."), ++ ) ++ parser.add_argument( ++ "--transformer_path", ++ type=str, ++ default=None, ++ help= ++ ("If you want to load the weight from other transformers, input its path." ++ ), ++ ) ++ parser.add_argument( ++ "--vae_path", ++ type=str, ++ default=None, ++ help=( ++ "If you want to load the weight from other vaes, input its path."), ++ ) ++ ++ parser.add_argument('--tokenizer_max_length', ++ type=int, ++ default=120, ++ help='Max length of tokenizer') ++ ++ args = parser.parse_args() ++ env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) ++ if env_local_rank != -1 and env_local_rank != args.local_rank: ++ args.local_rank = env_local_rank ++ ++ # default to using the same revision for the non-ema model if not specified ++ if args.non_ema_revision is None: ++ args.non_ema_revision = args.revision ++ ++ return args ++ ++ ++def main(): ++ args = parse_args() ++ ++ if args.report_to == "wandb" and args.hub_token is not None: ++ raise ValueError( ++ "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." ++ " Please use `huggingface-cli login` to authenticate with the Hub." 
++ ) ++ ++ if args.report_to == "wandb": ++ wandb.init(project=args.tracker_project_name, name=args.tracker_experiment_name) ++ ++ if args.non_ema_revision is not None: ++ deprecate( ++ "non_ema_revision!=None", ++ "0.15.0", ++ message= ++ ("Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to" ++ " use `--variant=non_ema` instead."), ++ ) ++ logging_dir = os.path.join(args.output_dir, args.logging_dir) ++ ++ config = OmegaConf.load(args.config_path) ++ accelerator_project_config = ProjectConfiguration( ++ project_dir=args.output_dir, logging_dir=logging_dir) ++ ++ deepspeed_plugin = DeepSpeedPlugin(zero_stage=2, ++ gradient_accumulation_steps=1) ++ accelerator = Accelerator( ++ gradient_accumulation_steps=args.gradient_accumulation_steps, ++ mixed_precision=args.mixed_precision, ++ log_with=args.report_to, ++ project_config=accelerator_project_config, ++ deepspeed_plugin=deepspeed_plugin) ++ if accelerator.is_main_process: ++ writer = SummaryWriter(log_dir=logging_dir) ++ ++ # Make one log on every process with the configuration for debugging. ++ logging.basicConfig( ++ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", ++ datefmt="%m/%d/%Y %H:%M:%S", ++ level=logging.INFO, ++ ) ++ logger.info(accelerator.state, main_process_only=False) ++ if accelerator.is_local_main_process: ++ datasets.utils.logging.set_verbosity_warning() ++ transformers.utils.logging.set_verbosity_warning() ++ diffusers.utils.logging.set_verbosity_info() ++ else: ++ datasets.utils.logging.set_verbosity_error() ++ transformers.utils.logging.set_verbosity_error() ++ diffusers.utils.logging.set_verbosity_error() ++ ++ # If passed along, set the training seed now. ++ if args.seed is not None: ++ set_seed(args.seed) ++ ++ # Handle the repository creation ++ if accelerator.is_main_process: ++ if args.output_dir is not None: ++ os.makedirs(args.output_dir, exist_ok=True) ++ ++ # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora transformer3d) to half-precision ++ # as these weights are only used for inference, keeping weights in full precision is not required. ++ weight_dtype = torch.float32 ++ if accelerator.mixed_precision == "fp16": ++ weight_dtype = torch.float16 ++ args.mixed_precision = accelerator.mixed_precision ++ elif accelerator.mixed_precision == "bf16": ++ weight_dtype = torch.bfloat16 ++ args.mixed_precision = accelerator.mixed_precision ++ ++ # Load scheduler, tokenizer and models. 
++ # noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") ++ train_diffusion = IDDPM(str(args.train_sampling_steps), ++ learn_sigma=True, ++ pred_sigma=True, ++ snr=args.snr_loss) ++ tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_name_or_path, ++ subfolder="tokenizer", ++ revision=args.revision) ++ ++ def deepspeed_zero_init_disabled_context_manager(): ++ """ ++ returns either a context list that includes one that will disable zero.Init or an empty context list ++ """ ++ deepspeed_plugin = AcceleratorState( ++ ).deepspeed_plugin if accelerate.state.is_initialized() else None ++ if deepspeed_plugin is None: ++ return [] ++ ++ return [deepspeed_plugin.zero3_init_context_manager(enable=False)] ++ ++ if OmegaConf.to_container(config['vae_kwargs'])['enable_magvit']: ++ Choosen_AutoencoderKL = AutoencoderKLMagvit ++ else: ++ Choosen_AutoencoderKL = AutoencoderKL ++ ++ # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3. ++ # For this to work properly all models must be run through `accelerate.prepare`. But accelerate ++ # will try to assign the same optimizer with the same weights to all models during ++ # `deepspeed.initialize`, which of course doesn't work. ++ # ++ # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2 ++ # frozen models from being partitioned during `zero.Init` which gets called during ++ # `from_pretrained` So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding ++ # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded. ++ with ContextManagers(deepspeed_zero_init_disabled_context_manager()): ++ text_encoder = T5EncoderModel.from_pretrained( ++ args.pretrained_model_name_or_path, ++ subfolder="text_encoder", ++ revision=args.revision, ++ variant=args.variant, ++ torch_dtype=weight_dtype) ++ vae = Choosen_AutoencoderKL.from_pretrained( ++ args.pretrained_model_name_or_path, ++ subfolder="vae", ++ revision=args.revision, ++ variant=args.variant) ++ ++ transformer3d = Transformer3DModel.from_pretrained_2d( ++ args.pretrained_model_name_or_path, ++ subfolder="transformer", ++ transformer_additional_kwargs=OmegaConf.to_container( ++ config['transformer_additional_kwargs'])) ++ ++ # Freeze vae and text_encoder and set transformer3d to trainable ++ vae.requires_grad_(False) ++ text_encoder.requires_grad_(False) ++ transformer3d.requires_grad_(False) ++ ++ # Lora will work with this... 
++ network = create_network( ++ 1.0, ++ args.rank, ++ args.network_alpha, ++ text_encoder, ++ transformer3d, ++ neuron_dropout=None, ++ add_lora_in_attn_temporal=True, ++ ) ++ network.apply_to(text_encoder, transformer3d, args.train_text_encoder, ++ True) ++ ++ if args.transformer_path is not None: ++ print(f"From checkpoint: {args.transformer_path}") ++ if args.transformer_path.endswith("safetensors"): ++ from safetensors.torch import load_file, safe_open ++ state_dict = load_file(args.transformer_path) ++ else: ++ state_dict = torch.load(args.transformer_path, map_location="cpu") ++ state_dict = state_dict[ ++ "state_dict"] if "state_dict" in state_dict else state_dict ++ ++ m, u = transformer3d.load_state_dict(state_dict, strict=False) ++ print(f"missing keys: {len(m)}, unexpected keys: {len(u)}") ++ assert len(u) == 0 ++ ++ if args.vae_path is not None: ++ print(f"From checkpoint: {args.vae_path}") ++ if args.vae_path.endswith("safetensors"): ++ from safetensors.torch import load_file, safe_open ++ state_dict = load_file(args.vae_path) ++ else: ++ state_dict = torch.load(args.vae_path, map_location="cpu") ++ state_dict = state_dict[ ++ "state_dict"] if "state_dict" in state_dict else state_dict ++ ++ m, u = vae.load_state_dict(state_dict, strict=False) ++ print(f"missing keys: {len(m)}, unexpected keys: {len(u)}") ++ assert len(u) == 0 ++ ++ if args.enable_xformers_memory_efficient_attention: ++ if is_xformers_available(): ++ import xformers ++ ++ xformers_version = version.parse(xformers.__version__) ++ if xformers_version == version.parse("0.0.16"): ++ logger.warn( ++ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ++ ) ++ transformer3d.enable_xformers_memory_efficient_attention() ++ else: ++ raise ValueError( ++ "xformers is not available. 
Make sure it is installed correctly" ++ ) ++ ++ # `accelerate` 0.16.0 will have better support for customized saving ++ if version.parse(accelerate.__version__) >= version.parse("0.16.0"): ++ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format ++ def save_model_hook(models, weights, output_dir): ++ if accelerator.is_main_process: ++ models[0].save_pretrained( ++ os.path.join(output_dir, "transformer")) ++ weights.pop() ++ ++ def load_model_hook(models, input_dir): ++ for i in range(len(models)): ++ # pop models so that they are not loaded again ++ model = models.pop() ++ ++ # load diffusers style into model ++ load_model = Transformer3DModel.from_pretrained_2d( ++ input_dir, ++ subfolder="transformer", ++ transformer_additional_kwargs=OmegaConf.to_container( ++ config['transformer_additional_kwargs'])) ++ model.register_to_config(**load_model.config) ++ ++ model.load_state_dict(load_model.state_dict()) ++ del load_model ++ ++ accelerator.register_save_state_pre_hook(save_model_hook) ++ accelerator.register_load_state_pre_hook(load_model_hook) ++ ++ if args.gradient_checkpointing: ++ transformer3d.enable_gradient_checkpointing() ++ ++ # Enable TF32 for faster training on Ampere GPUs, ++ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices ++ if args.allow_tf32: ++ torch.backends.cuda.matmul.allow_tf32 = True ++ ++ if args.scale_lr: ++ args.learning_rate = auto_scale_lr( ++ args.gradient_accumulation_steps * args.train_batch_size * ++ accelerator.num_processes, args.learning_rate, 'sqrt') ++ ++ # Initialize the optimizer ++ if args.use_8bit_adam: ++ try: ++ import bitsandbytes as bnb ++ except ImportError: ++ raise ImportError( ++ "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`" ++ ) ++ ++ optimizer_cls = bnb.optim.AdamW8bit ++ else: ++ optimizer_cls = torch.optim.AdamW ++ ++ logging.info("Add network parameters") ++ trainable_params = list( ++ filter(lambda p: p.requires_grad, network.parameters())) ++ trainable_params_optim = network.prepare_optimizer_params( ++ args.learning_rate / 2, args.learning_rate, args.learning_rate) ++ ++ optimizer = optimizer_cls( ++ trainable_params_optim, ++ lr=args.learning_rate, ++ betas=(args.adam_beta1, args.adam_beta2), ++ weight_decay=args.adam_weight_decay, ++ eps=args.adam_epsilon, ++ ) ++ ++ train_dataset = VideoDataset(args.train_data_meta, ++ args.train_data_dir, ++ sample_size=args.sample_size, ++ sample_stride=args.sample_stride, ++ sample_n_frames=args.sample_n_frames, ++ enable_inpaint=False) ++ ++ # DataLoaders creation: ++ train_dataloader = torch.utils.data.DataLoader( ++ train_dataset, ++ shuffle=True, ++ persistent_workers=True if args.dataloader_num_workers != 0 else False, ++ batch_size=args.train_batch_size, ++ num_workers=args.dataloader_num_workers, ++ ) ++ ++ # Scheduler and math around the number of training steps. 
++ overrode_max_train_steps = False ++ num_update_steps_per_epoch = math.ceil( ++ len(train_dataloader) / args.gradient_accumulation_steps) ++ if args.max_train_steps is None: ++ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch ++ overrode_max_train_steps = True ++ ++ lr_scheduler = get_scheduler( ++ args.lr_scheduler, ++ optimizer=optimizer, ++ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, ++ num_training_steps=args.max_train_steps * accelerator.num_processes, ++ ) ++ ++ network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( ++ network, optimizer, train_dataloader, lr_scheduler) ++ ++ # Move text_encode and vae to gpu and cast to weight_dtype ++ transformer3d.to(accelerator.device, dtype=weight_dtype) ++ text_encoder.to(accelerator.device) ++ vae.to(accelerator.device, dtype=weight_dtype) ++ ++ # We need to recalculate our total training steps as the size of the training dataloader may have changed. ++ num_update_steps_per_epoch = math.ceil( ++ len(train_dataloader) / args.gradient_accumulation_steps) ++ if overrode_max_train_steps: ++ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch ++ # Afterwards we recalculate our number of training epochs ++ args.num_train_epochs = math.ceil(args.max_train_steps / ++ num_update_steps_per_epoch) ++ ++ # We need to initialize the trackers we use, and also store our configuration. ++ # The trackers initializes automatically on the main process. ++ if accelerator.is_main_process: ++ tracker_config = dict(vars(args)) ++ tracker_config.pop("validation_prompts") ++ tracker_config.pop("validation_images") ++ accelerator.init_trackers(args.tracker_project_name, tracker_config) ++ ++ # Function for unwrapping if model was compiled with `torch.compile`. ++ def unwrap_model(model): ++ model = accelerator.unwrap_model(model) ++ model = model._orig_mod if is_compiled_module(model) else model ++ return model ++ ++ # Train! ++ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps ++ total_compute_budget = args.num_train_epochs * len( ++ train_dataset) * args.sample_n_frames * args.sample_size**2 ++ ++ logger.info("***** Running training *****") ++ logger.info(f" Num examples = {len(train_dataset)}") ++ logger.info(f" Num Epochs = {args.num_train_epochs}") ++ logger.info( ++ f" Instantaneous batch size per device = {args.train_batch_size}") ++ logger.info( ++ f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ++ ) ++ logger.info( ++ f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") ++ logger.info(f" Total optimization steps = {args.max_train_steps}") ++ logger.info(f" Total compute budget = {total_compute_budget}") ++ ++ global_step = 0 ++ first_epoch = 0 ++ ++ # Potentially load in the weights and states from a previous save ++ if args.resume_from_checkpoint: ++ if args.resume_from_checkpoint != "latest": ++ path = os.path.basename(args.resume_from_checkpoint) ++ else: ++ # Get the most recent checkpoint ++ dirs = os.listdir(args.output_dir) ++ dirs = [d for d in dirs if d.startswith("checkpoint")] ++ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) ++ path = dirs[-1] if len(dirs) > 0 else None ++ ++ if path is None: ++ accelerator.print( ++ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." 
++ ) ++ args.resume_from_checkpoint = None ++ initial_global_step = 0 ++ else: ++ accelerator.print(f"Resuming from checkpoint {path}") ++ accelerator.load_state(os.path.join(args.output_dir, path)) ++ global_step = int(path.split("-")[1]) ++ ++ initial_global_step = global_step ++ first_epoch = global_step // num_update_steps_per_epoch ++ ++ else: ++ initial_global_step = 0 ++ ++ # function for saving/removing ++ def save_model(ckpt_file, unwrapped_nw): ++ os.makedirs(args.output_dir, exist_ok=True) ++ accelerator.print(f"\nsaving checkpoint: {ckpt_file}") ++ unwrapped_nw.save_weights(ckpt_file, weight_dtype, None) ++ ++ progress_bar = tqdm( ++ range(0, args.max_train_steps), ++ initial=initial_global_step, ++ desc="Steps", ++ # Only show the progress bar once on each machine. ++ disable=not accelerator.is_local_main_process, ++ ) ++ ++ log_validation(vae, text_encoder, tokenizer, transformer3d, network, ++ config, args, accelerator, weight_dtype, global_step) ++ ++ for epoch in range(first_epoch, args.num_train_epochs): ++ train_loss = 0.0 ++ for step, batch in enumerate(train_dataloader): ++ # Data batch sanity check ++ if epoch == first_epoch and step == 0: ++ pixel_values, texts = batch['pixel_values'].cpu( ++ ), batch['text'] ++ pixel_values = rearrange(pixel_values, ++ "b f c h w -> b c f h w") ++ os.makedirs(os.path.join(args.output_dir, "sanity_check"), ++ exist_ok=True) ++ for idx, (pixel_value, ++ text) in enumerate(zip(pixel_values, texts)): ++ pixel_value = pixel_value[None, ...] ++ save_videos_grid( ++ pixel_value, ++ f"{args.output_dir}/sanity_check/{'-'.join(text.replace('/', '').split()[:10]) if not text == '' else f'{global_step}-{idx}'}.gif", ++ rescale=True) ++ ++ with accelerator.accumulate(network): ++ # Convert images to latent space ++ pixel_values = batch["pixel_values"].to(weight_dtype) ++ ++ if args.random_frame_crop: ++ select_frames = [ ++ _tmp ++ for _tmp in list(range(4, args.sample_n_frames + 4, 4)) ++ ] ++ select_frames_prob = [_tmp**2 for _tmp in select_frames] ++ select_frames_prob = np.array(select_frames_prob) / sum( ++ select_frames_prob) ++ select_frames_prob = np.array(select_frames) / sum( ++ select_frames) ++ ++ temp_n_frames = np.random.choice(select_frames, ++ p=select_frames_prob) ++ pixel_values = pixel_values[:, :temp_n_frames, :, :] ++ ++ video_length = pixel_values.shape[1] ++ with torch.no_grad(): ++ # This way will be slow when batch grows up ++ # pixel_values = rearrange(pixel_values, "b f c h w -> (b f) c h w") ++ # latents = vae.encode(pixel_values.to(dtype=weight_dtype)).latent_dist ++ # latents = latents.sample() ++ # latents = rearrange(latents, "(b f) c h w -> b c f h w", f=video_length) ++ ++ # Convert images to latent space ++ if vae.quant_conv.weight.ndim == 5: ++ # This way is quicker when batch grows up ++ pixel_values = rearrange(pixel_values, ++ "b f c h w -> b c f h w") ++ mini_batch = 21 ++ new_pixel_values = [] ++ for i in range(0, pixel_values.shape[2], mini_batch): ++ pixel_values_bs = pixel_values[:, :, i:i + ++ mini_batch, :, :] ++ pixel_values_bs = vae.encode(pixel_values_bs)[0] ++ pixel_values_bs = pixel_values_bs.sample() ++ new_pixel_values.append(pixel_values_bs) ++ # if i == pixel_values.shape[2] - 1: ++ # break ++ # with torch.no_grad(): ++ # pixel_values_bs = pixel_values[:, :, i: i + mini_batch + 1, :, :].to(dtype=weight_dtype) ++ # pixel_values_bs = vae.encode(pixel_values_bs)[0] ++ # pixel_values_bs = pixel_values_bs.sample() ++ # new_pixel_values.append(pixel_values_bs if i == 0 else pixel_values_bs[:, :, 
1:, :, :]) ++ latents = torch.cat(new_pixel_values, dim=2) ++ else: ++ # This way is quicker when batch grows up ++ pixel_values = rearrange(pixel_values, ++ "b f c h w -> (b f) c h w") ++ bs = args.vae_mini_batch ++ new_pixel_values = [] ++ for i in range(0, pixel_values.shape[0], bs): ++ pixel_values_bs = pixel_values[i:i + bs] ++ pixel_values_bs = vae.encode( ++ pixel_values_bs.to( ++ dtype=weight_dtype)).latent_dist ++ pixel_values_bs = pixel_values_bs.sample() ++ new_pixel_values.append(pixel_values_bs) ++ latents = torch.cat(new_pixel_values, dim=0) ++ latents = rearrange(latents, ++ "(b f) c h w -> b c f h w", ++ f=video_length) ++ ++ latents = latents * 0.18215 ++ ++ prompt_ids = tokenizer(batch['text'], ++ max_length=args.tokenizer_max_length, ++ padding="max_length", ++ add_special_tokens=True, ++ truncation=True, ++ return_tensors="pt") ++ encoder_hidden_states = text_encoder( ++ prompt_ids.input_ids.to(latents.device), ++ attention_mask=prompt_ids.attention_mask.to( ++ latents.device), ++ return_dict=False)[0] ++ ++ bsz = latents.shape[0] ++ # Sample a random timestep for each image ++ timesteps = torch.randint(0, ++ args.train_sampling_steps, (bsz, ), ++ device=latents.device) ++ timesteps = timesteps.long() ++ ++ added_cond_kwargs = {"resolution": None, "aspect_ratio": None} ++ loss_term = train_diffusion.training_losses( ++ transformer3d, ++ latents, ++ timesteps, ++ model_kwargs=dict( ++ encoder_hidden_states=encoder_hidden_states, ++ encoder_attention_mask=prompt_ids.attention_mask.to( ++ latents.device), ++ added_cond_kwargs=added_cond_kwargs, ++ inpaint_latents=None, ++ return_dict=False)) ++ loss = loss_term['loss'].mean() ++ ++ # Gather the losses across all processes for logging (if we use distributed training). ++ avg_loss = accelerator.gather( ++ loss.repeat(args.train_batch_size)).mean() ++ train_loss += avg_loss.item( ++ ) / args.gradient_accumulation_steps ++ ++ # Backpropagate ++ accelerator.backward(loss) ++ if accelerator.sync_gradients: ++ accelerator.clip_grad_norm_(trainable_params, ++ args.max_grad_norm) ++ optimizer.step() ++ lr_scheduler.step() ++ optimizer.zero_grad() ++ ++ # Checks if the accelerator has performed an optimization step behind the scenes ++ if accelerator.sync_gradients: ++ progress_bar.update(1) ++ global_step += 1 ++ accelerator.log({"train_loss": train_loss}, step=global_step) ++ train_loss = 0.0 ++ ++ if global_step % args.checkpointing_steps == 0: ++ if accelerator.is_main_process: ++ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` ++ if args.checkpoints_total_limit is not None: ++ checkpoints = os.listdir(args.output_dir) ++ checkpoints = [ ++ d for d in checkpoints ++ if d.startswith("checkpoint") ++ ] ++ checkpoints = sorted( ++ checkpoints, ++ key=lambda x: int(x.split("-")[1])) ++ ++ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints ++ if len(checkpoints ++ ) >= args.checkpoints_total_limit: ++ num_to_remove = len( ++ checkpoints ++ ) - args.checkpoints_total_limit + 1 ++ removing_checkpoints = checkpoints[ ++ 0:num_to_remove] ++ ++ logger.info( ++ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" ++ ) ++ logger.info( ++ f"removing checkpoints: {', '.join(removing_checkpoints)}" ++ ) ++ ++ for removing_checkpoint in removing_checkpoints: ++ removing_checkpoint = os.path.join( ++ args.output_dir, removing_checkpoint) ++ shutil.rmtree(removing_checkpoint) ++ ++ safetensor_save_path = 
os.path.join( ++ args.output_dir, ++ f"checkpoint-{global_step}.safetensors") ++ accelerator_save_path = os.path.join( ++ args.output_dir, f"checkpoint-{global_step}") ++ save_model(safetensor_save_path, ++ accelerator.unwrap_model(network)) ++ #if args.save_state: ++ #accelerator.save_state(accelerator_save_path) ++ #logger.info(f"Saved state to {accelerator_save_path}") ++ ++ if accelerator.is_main_process: ++ if args.validation_prompts is not None and global_step % args.validation_steps == 0: ++ log_validation( ++ vae, ++ text_encoder, ++ tokenizer, ++ transformer3d, ++ network, ++ config, ++ args, ++ accelerator, ++ weight_dtype, ++ global_step, ++ ) ++ ++ logs = { ++ "step_loss": loss.detach().item(), ++ "lr": lr_scheduler.get_last_lr()[0] ++ } ++ progress_bar.set_postfix(**logs) ++ ++ if global_step >= args.max_train_steps: ++ break ++ ++ if accelerator.is_main_process: ++ if args.validation_prompts is not None and epoch % args.validation_epochs == 0: ++ log_validation( ++ vae, ++ text_encoder, ++ tokenizer, ++ transformer3d, ++ network, ++ config, ++ args, ++ accelerator, ++ weight_dtype, ++ global_step, ++ ) ++ ++ # Create the pipeline using the trained modules and save it. ++ accelerator.wait_for_everyone() ++ if accelerator.is_main_process: ++ safetensor_save_path = os.path.join( ++ args.output_dir, f"checkpoint-{global_step}.safetensors") ++ accelerator_save_path = os.path.join(args.output_dir, ++ f"checkpoint-{global_step}") ++ save_model(safetensor_save_path, accelerator.unwrap_model(network)) ++ #if args.save_state: ++ #accelerator.save_state(accelerator_save_path) ++ #logger.info(f"Saved state to {accelerator_save_path}") ++ ++ accelerator.end_training() ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/train_lora.sh b/train_lora.sh +new file mode 100644 +index 0000000..03bd609 +--- /dev/null ++++ b/train_lora.sh +@@ -0,0 +1,57 @@ ++########################################################## ++# pretrained model path ++PRETRAINED_MODEL_NAME_OR_PATH=$1 ++TRANSFORMER_PATH=$2 ++ ++# dataset path ++DATASET_NAME=$3 ++DATASET_META_NAME=$4 ++ ++# training config ++SAMPLE_SIZE=$5 ++MIXED_PRECISION=$6 ++BATCH_SIZE_PER_GPU=$7 ++GRADIENT_ACCUMULATION_STEPS=$8 ++NUM_TRAIN_EPOCHS=$9 ++DATALOADER_NUM_WORKERS=${10} ++SEED=${11} ++ ++# saving config ++OUTPUT_DIR=${12} ++CHECKPOINTING_STEPS=1000 ++VALIDATION_STEPS=500 ++VALIDATION_PROMPTS="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff\'s precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures." 
++ ++# tracer config ++PROJECT_NAME=${13} ++EXPERIMENT_NAME=${14} ++########################################################## ++ ++ ++accelerate launch --mixed_precision=$MIXED_PRECISION train_lora.py \ ++ --config_path "config/easyanimate_video_motion_module_v1.yaml" \ ++ --pretrained_model_name_or_path=$PRETRAINED_MODEL_NAME_OR_PATH \ ++ --transformer_path=$TRANSFORMER_PATH \ ++ --train_data_dir=$DATASET_NAME \ ++ --train_data_meta=$DATASET_META_NAME \ ++ --sample_size=$SAMPLE_SIZE \ ++ --sample_n_frames=16 \ ++ --sample_stride=2 \ ++ --train_batch_size=$BATCH_SIZE_PER_GPU \ ++ --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS \ ++ --num_train_epochs=$NUM_TRAIN_EPOCHS \ ++ --dataloader_num_workers=$DATALOADER_NUM_WORKERS \ ++ --checkpointing_steps=$CHECKPOINTING_STEPS \ ++ --validation_prompts="$VALIDATION_PROMPTS" \ ++ --output_dir=$OUTPUT_DIR \ ++ --validation_steps=$VALIDATION_STEPS \ ++ --learning_rate=2e-05 \ ++ --seed=$SEED \ ++ --enable_xformers_memory_efficient_attention \ ++ --gradient_checkpointing \ ++ --adam_weight_decay=3e-2 \ ++ --adam_epsilon=1e-10 \ ++ --vae_mini_batch=1 \ ++ --report_to="wandb" \ ++ --tracker_project_name=$PROJECT_NAME \ ++ --tracker_experiment_name=$EXPERIMENT_NAME diff --git a/thirdparty/models/setup_easyanimate.sh b/thirdparty/models/setup_easyanimate.sh new file mode 100755 index 000000000..f0eabd88b --- /dev/null +++ b/thirdparty/models/setup_easyanimate.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +export THIRD_PARTY_DIR=$(cd $(dirname $0); pwd) +export EASYANIMATE_DIR=${THIRD_PARTY_DIR}/EasyAnimate + +# setup easyanimate +echo "> setup easyanimate ..." +git clone https://github.com/aigc-apps/EasyAnimate.git +cd $EASYANIMATE_DIR +git reset b54412ceb0af6a06bf907e049920f18508c862f1 --hard +git apply ${THIRD_PARTY_DIR}/patch/easyanimate.diff diff --git a/tools/distributed_deduplication/README.md b/tools/distributed_deduplication/README.md index 23464af32..6f69ba354 100644 --- a/tools/distributed_deduplication/README.md +++ b/tools/distributed_deduplication/README.md @@ -4,7 +4,7 @@ Help you reproduce and apply fuzzy deduplication to your web datasets similar to **The General Description about Fuzzy Deduplication**: -The fuzzy deduplication method here mainly refer to the fuzzy deduplication method mentioned in the Appendix A of [GPT-3 paper](https://arxiv.org/pdf/2005.14165.pdf). +The fuzzy deduplication method here mainly refer to the fuzzy deduplication method mentioned in the Appendix A of [GPT-3 paper](https://arxiv.org/pdf/2005.14165.pdf). > To further improve model quality and prevent overfitting (which becomes increasingly important as model capacity increases), we fuzzily deduplicated documents (i.e. removed documents with high overlap with other documents) within each dataset using Spark’s MinHashLSH implementation with 10 hashes, using **the same features as were used for classification above**. We also fuzzily removed WebText from Common Crawl. Overall this decreased dataset size by an average of 10%. 
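For reference, the MinHashLSH-based fuzzy deduplication described in the README hunk above can be sketched in a few lines of PySpark. This is a minimal illustration only, not the toolkit's actual implementation (that lives under `tools/distributed_deduplication/`); the toy corpus, the `id`/`text` column names, and the 0.2 Jaccard-distance threshold are assumptions made for the example.

```python
# Minimal sketch of GPT-3-style fuzzy deduplication with Spark's MinHashLSH.
# Assumptions: a toy in-memory corpus, "id"/"text" columns, and a Jaccard-distance
# threshold of 0.2; the real toolkit reads its dataset and thresholds from config.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import HashingTF, MinHashLSH, Tokenizer

spark = SparkSession.builder.appName("fuzzy-dedup-sketch").getOrCreate()

docs = spark.createDataFrame(
    [(0, "the quick brown fox jumps over the lazy dog"),
     (1, "the quick brown fox jumped over a lazy dog"),
     (2, "an entirely different document about video datasets")],
    ["id", "text"])

# Tokenize and hash each document into a sparse term-frequency vector.
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
hashing_tf = HashingTF(inputCol="tokens", outputCol="features", numFeatures=1 << 18)
featurized = hashing_tf.transform(tokenizer.transform(docs))

# 10 hash tables, mirroring the "10 hashes" mentioned in the GPT-3 appendix.
lsh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=10)
model = lsh.fit(featurized)

# Self-join to find near-duplicate pairs, then drop the higher-id member of each pair.
pairs = (model.approxSimilarityJoin(featurized, featurized, 0.2, distCol="jaccard")
         .where(F.col("datasetA.id") < F.col("datasetB.id")))
dup_ids = pairs.select(F.col("datasetB.id").alias("id")).distinct()
deduped = docs.join(dup_ids, on="id", how="left_anti")
deduped.show(truncate=False)
```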
diff --git a/tools/distributed_deduplication/README_ZH.md b/tools/distributed_deduplication/README_ZH.md index 6a74006e7..0aa8eb92e 100644 --- a/tools/distributed_deduplication/README_ZH.md +++ b/tools/distributed_deduplication/README_ZH.md @@ -2,7 +2,7 @@ 复现与GPT-3论文相似的模糊去重方法并应用到您的Web数据集。 **模糊去重的一般描述**: -这里的模糊去重方法主要指的是 [GPT-3论文](https://arxiv.org/pdf/2005.14165.pdf)附录A中提到的模糊去重方法。 +这里的模糊去重方法主要指的是 [GPT-3论文](https://arxiv.org/pdf/2005.14165.pdf)附录A中提到的模糊去重方法。 > 为了进一步提高模型质量并防止过拟合(随着模型容量的增加越来越重要),我们使用Spark的MinHashLSH实现对每个数据集中的文档进行了模糊去重(即移除了与其他文档高度重合的文档),使用了10个哈希,使用的**特征与上面用于分类的特征相同**。我们还从Common Crawl中模糊移除了WebText。总体而言,这使数据集的大小平均减少了10%。 正如论文中提到的,使用的特征与前文描述的质量分类器([quality_classifier tools](../quality_classifier/README.md))中所用的一致。 整个工具包基于PySpark。 diff --git a/tools/hpo/README.md b/tools/hpo/README.md index b840636f1..37c114d61 100644 --- a/tools/hpo/README.md +++ b/tools/hpo/README.md @@ -1,8 +1,8 @@ # Hyper-parameter Optimization for Data Recipe ## Auto-HPO based on 3-Sigma principles -A simple automatic hyper-parameter optimization method for data recipes is to assume that outlier data is harmful to training. -We thus can introduce the 3-sigma principle to automatically determine the hyper-parameters and filter the data. +A simple automatic hyper-parameter optimization method for data recipes is to assume that outlier data is harmful to training. +We thus can introduce the 3-sigma principle to automatically determine the hyper-parameters and filter the data. Specifically, assuming that a certain analysis dimension of the original data obeys a normal distribution and has random errors, we can set the upper and lower bounds of the filtering OP in this dimension to three times the standard deviation based on the statistics produced by the DataJuicer's Analyzer. @@ -11,15 +11,15 @@ $$P(|x-\mu| > 3\sigma) \leq 0.003$$ To automate this process, we provide the tool which can be used as follows: ```shell # cd tools/hpo -# usage 1: do not save the refined recipe -python execute_hpo_3sigma.py --config +# usage 1: do not save the refined recipe +python execute_hpo_3sigma.py --config # usage 2: save the refined recipe at the given path -python execute_hpo_3sigma.py --config --path_3sigma_recipe +python execute_hpo_3sigma.py --config --path_3sigma_recipe # e.g., usage 1 -python execute_hpo_3sigma.py --config configs/process.yaml +python execute_hpo_3sigma.py --config configs/process.yaml # e.g., usage 2 -python execute_hpo_3sigma.py --config configs/process.yaml --path_3sigma_recipe configs/process_3sigma.yaml +python execute_hpo_3sigma.py --config configs/process.yaml --path_3sigma_recipe configs/process_3sigma.yaml ``` ## Auto-HPO with WandB @@ -62,7 +62,7 @@ python execute_hpo_wandb.py --config configs/process.yaml --hpo_config configs/q ``` For the configuration for data recipe, i.e., ``, -please see more details in our [guidance](https://github.com/alibaba/data-juicer#build-up-config-files). As for the configuration +please see more details in our [guidance](https://github.com/alibaba/data-juicer#build-up-config-files). As for the configuration for HPO, i.e., ``, please refer to sweep [guidance](https://docs.wandb.ai/guides/sweeps/define-sweep-configuration). 
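Editor's note: before continuing with the WandB objective below, here is a small sketch that makes the 3-sigma rule described earlier in this README concrete. The CSV path and the `perplexity` column are hypothetical stand-ins for whatever per-sample statistics the Analyzer exports; the real refinement is done by `execute_hpo_3sigma.py`, not by this snippet.

```python
import pandas as pd

# Hypothetical export of the Analyzer's per-sample stats, one column per dimension.
stats = pd.read_csv('analysis_stats.csv')
dim = 'perplexity'  # hypothetical analysis dimension handled by a filter OP

mu, sigma = stats[dim].mean(), stats[dim].std()
lower, upper = mu - 3 * sigma, mu + 3 * sigma  # P(|x - mu| > 3*sigma) <= 0.003

# These bounds would become the min/max hyper-parameters of the corresponding
# filter OP in the refined recipe; samples outside them are treated as outliers.
kept = stats[(stats[dim] >= lower) & (stats[dim] <= upper)]
print(f'{dim}: keep [{lower:.2f}, {upper:.2f}], '
      f'{len(kept)}/{len(stats)} samples retained')
```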
We provide an illustrative objective "quality_score" in `hpo/objects.py`, diff --git a/tools/hpo/README_ZH.md b/tools/hpo/README_ZH.md index de7a46d59..6584926b4 100644 --- a/tools/hpo/README_ZH.md +++ b/tools/hpo/README_ZH.md @@ -9,19 +9,19 @@ $$P(|x-\mu| > 3\sigma) \leq 0.003$$ 为了自动化该过程,我们提供了相应工具: ```shell # cd tools/hpo -# usage 1: do not save the refined recipe -python execute_hpo_3sigma.py --config +# usage 1: do not save the refined recipe +python execute_hpo_3sigma.py --config # usage 2: save the refined recipe at the given path -python execute_hpo_3sigma.py --config --path_3sigma_recipe +python execute_hpo_3sigma.py --config --path_3sigma_recipe # e.g., usage 1 -python execute_hpo_3sigma.py --config configs/process.yaml +python execute_hpo_3sigma.py --config configs/process.yaml # e.g., usage 2 -python execute_hpo_3sigma.py --config configs/process.yaml --path_3sigma_recipe configs/process_3sigma.yaml +python execute_hpo_3sigma.py --config configs/process.yaml --path_3sigma_recipe configs/process_3sigma.yaml ``` -## 基于WandB进行Auto-HPO +## 基于WandB进行Auto-HPO 我们将自动化 HPO (hyper-parameters optimization) 工具 WandB [Sweep](https://docs.wandb.ai/guides/sweeps) 结合到 Data-Juicer 中,以简化改良数据处理超参数的过程。 @@ -60,7 +60,7 @@ python execute_hpo_wandb.py --config configs/process.yaml --hpo_config configs/q ``` 对于数据菜谱的配置,即``, -请参阅我们的 [指南](https://github.com/alibaba/data-juicer/blob/main/README_ZH.md#%E6%9E%84%E5%BB%BA%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6) +请参阅我们的 [指南](https://github.com/alibaba/data-juicer/blob/main/README_ZH.md#%E6%9E%84%E5%BB%BA%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6) 获取更多详细信息。 对于HPO的配置,即``,请参阅Sweep提供的 [指南](https://docs.wandb.ai/guides/sweeps/define-sweep-configuration) 。 diff --git a/tools/mm_eval/README.md b/tools/mm_eval/vbench_metrics/README.md similarity index 100% rename from tools/mm_eval/README.md rename to tools/mm_eval/vbench_metrics/README.md diff --git a/tools/mm_eval/README_ZH.md b/tools/mm_eval/vbench_metrics/README_ZH.md similarity index 100% rename from tools/mm_eval/README_ZH.md rename to tools/mm_eval/vbench_metrics/README_ZH.md diff --git a/tools/mm_eval/vbench_metrics/VBench_mini_info.json b/tools/mm_eval/vbench_metrics/VBench_mini_info.json new file mode 100644 index 000000000..e1937a62e --- /dev/null +++ b/tools/mm_eval/vbench_metrics/VBench_mini_info.json @@ -0,0 +1,109 @@ +[ + { + "prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time", + "dimension": [ + "temporal_flickering" + ] + }, + { + "prompt_en": "a person and a sink", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "person and sink" + } + } + }, + { + "prompt_en": "A person is sailing", + "dimension": [ + "human_action" + ] + }, + { + "prompt_en": "a giraffe running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ] + }, + { + "prompt_en": "a toothbrush", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "toothbrush" + } + } + }, + { + "prompt_en": "a white vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + } + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Yellow flowers swing in the wind",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "windmill",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "windmill"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a surfboard on the bottom of skis, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "surfboard",
+                    "object_b": "skis",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    }
+]
diff --git a/tools/multimodal/README.md b/tools/multimodal/README.md
index 78e382ceb..60ff084b8 100644
--- a/tools/multimodal/README.md
+++ b/tools/multimodal/README.md
@@ -15,27 +15,27 @@ python tools/multimodal/absolute_path_to_relative_path.py --help
 
 ## Dataset Format Conversion
 
-Due to large format diversity among different multimodal datasets and works,
-Data-Juicer propose a novel intermediate text-based interleaved data format for multimodal dataset, which
+Due to large format diversity among different multimodal datasets and works,
+Data-Juicer proposes a novel intermediate text-based interleaved data format for multimodal datasets, which
 is based on chunk-wise formats such MMC4 dataset.
-In the Data-Juicer format, a multimodal sample or document is based on a text,
+In the Data-Juicer format, a multimodal sample or document is based on a text,
 which consists of several text chunks. Each chunk is a semantic unit, and all
 the multimodal information in a chunk should talk about the same thing and be aligned with each other.
 
 Here is a multimodal sample example in Data-Juicer format below.
 - It includes 4 chunks split by the special token `<|__dj__eoc|>`.
-- In addition to texts, there are 3 other modalities: images, audios, videos.
+- In addition to texts, there are 3 other modalities: images, audios, videos.
 They are stored on the disk and their paths are listed in the corresponding first-level fields in the sample.
-- Other modalities are represented as special tokens in the text (e.g. image -- `<__dj__image>`).
-The special tokens of each modality correspond to the paths in the order of appearance.
+- Other modalities are represented as special tokens in the text (e.g. image -- `<__dj__image>`).
+The special tokens of each modality correspond to the paths in the order of appearance.
 (e.g. the two image tokens in the third chunk are images of antarctica_map and europe_map respectively)
-- There could be multiple types of modalities and multiple modality special tokens in a single chunk,
-and they are semantically aligned with each other and text in this chunk.
+- There could be multiple types of modalities and multiple modality special tokens in a single chunk,
+and they are semantically aligned with each other and with the text in this chunk.
 The position of special tokens can be random in a chunk. (In general, they are usually before or after the text.)
-- For multimodal samples, unlike text-only samples, the computed stats for other
+- For multimodal samples, unlike text-only samples, the computed stats for other
 modalities could be a list of stats for the list of multimodal data (e.g. image_widths in this sample).
 
 ```python
@@ -71,7 +71,7 @@ modalities could be a list of stats for the list of multimodal data (e.g. image_
 }
 ```
 
-According to this format, Data-Juicer provided several dataset format conversion tools for some popular multimodal
+According to this format, Data-Juicer provides several dataset format conversion tools for some popular multimodal
 works. These tools consist of two types:
 
@@ -97,24 +97,24 @@ For all tools, you can run the following command to find out the usage of them:
 python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --help
 ```
 
-Before using these tools, you might need to take a glance at the reference
-materials in the above tables for each format, to better know the detail format
+Before using these tools, you might need to take a glance at the reference
+materials in the above tables for each format, to better know the detailed format
 information and understand the arguments for each tool.
 
 ### Notice
 
-There might be some tiny differences after converting a source dataset to Data-Juicer
-format and convert it back. However, these differences have nearly no effects
-on the semantics of datasets. Here we will show these tiny differences in detail
+There might be some tiny differences after converting a source dataset to Data-Juicer
+format and converting it back. However, these differences have nearly no effects
+on the semantics of datasets. Here we will show these tiny differences in detail
 for each source format.
 
 #### LLaVA-like
 
-The format of LLaVA-like datasets are defined [here](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format).
-Although it's simple, but in real scenarios, there might be some slight variations
+The format of LLaVA-like datasets is defined [here](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format).
+Although it's simple, in real scenarios there might be some slight variations
 in some samples.
 
-Here we take the [visual instruction tuning dataset](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json) as an example,
-and show how these variations influence the dataset format. The table below
-shows the number of different samples between the original dataset and the
+Here we take the [visual instruction tuning dataset](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json) as an example,
+and show how these variations influence the dataset format. The table below
+shows the number of different samples between the original dataset and the
 dataset after processing. There are 665,298 samples in the original dataset.
 
 | process | # of diff. |
@@ -124,11 +124,11 @@ dataset after processing. There are 665,298 samples in the original dataset.
 | 3. strip whitespaces before and after values of conversations in the original dataset | 40,688 |
 | 4. add `'model': ''` fields in the converted dataset | 1 |
 
-It's worth noticing that processes 2-4 won't influence the semantics of sample conversations in the dataset.
-Thus we think the dataset after conversion can align with the original dataset.
+It's worth noticing that processes 2-4 won't influence the semantics of sample conversations in the dataset.
+Thus we think the dataset after conversion can align with the original dataset. Finally, the only 1 sample is different because there are some extra useless fields ("text", "markdown") -in the conversations, which is shown below. But the "from" and "value" fields are the same between original +in the conversations, which is shown below. But the "from" and "value" fields are the same between original and converted datasets, so we can regard this sample is aligned with the original one as well. ```json @@ -167,7 +167,7 @@ and converted datasets, so we can regard this sample is aligned with the origina The format of MMC4-like datasets are defined [here](https://github.com/allenai/mmc4#documents). Except `image_info` and `text_list`, which are used when converting them to Data-Juicer format, there is an important field `similarity_matrix`. Similarity matrix is -a matrix of shape `len(image_info) x len(text_list)`, which means it highly depends on the numbers of images and text sentences and their +a matrix of shape `len(image_info) x len(text_list)`, which means it highly depends on the numbers of images and text sentences and their orders. However, when processing such datasets with Data-Juicer, images or sentences might be removed from a sample by Filters, and they could be @@ -192,7 +192,7 @@ The [WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) is composed of four "id": "2219", "duration": 14.1424375, "audio": "wav_path", - "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}] + "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}] } # converted dataset @@ -208,7 +208,7 @@ The [WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) is composed of four "audio": "wav_path", "download_link": "http://soundbible.com/grab.php?id=2219&type=wav", "category": "", - "tags": "" }] + "tags": "" }] } ``` @@ -224,28 +224,28 @@ They all obey the `` format, where the `video_id` is #### Youku-mPLUG-like -The Youku-mPLUG dataset contains 4 types of format: pretrain, classification, retrieval, captioning. +The Youku-mPLUG dataset contains 4 types of format: pretrain, classification, retrieval, captioning. They are slightly different from each other in field name or other attributes, but all of them obey the `` format. #### InternVid-like The InternVid dataset contains 4 fields: -- `YoutubeID`: the Youtube ID of the video used in the sample. +- `YoutubeID`: the Youtube ID of the video used in the sample. We suppose that users have downloaded these videos already and this field is replaced with its storage path. -- `Start_timestamp`: the start timestamp in string of the video clip for the +- `Start_timestamp`: the start timestamp in string of the video clip for the corresponding caption. -- `End_timestamp`: the end timestamp in string of the video clip for the +- `End_timestamp`: the end timestamp in string of the video clip for the corresponding caption. - `Caption`: the corresponding caption for the video clip. -As we can see, the caption in this dataset corresponds to the video clip -specified by the start/end timestamps instead of the whole video. So the -conversion tool will cut the specified video clip for you if the argument +As we can see, the caption in this dataset corresponds to the video clip +specified by the start/end timestamps instead of the whole video. So the +conversion tool will cut the specified video clip for you if the argument `cut_videos` is set to True. You can cut before conversion by yourself as well. 
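Editor's note: as a companion to the InternVid description above, the following is a rough sketch of the clip cutting that the `cut_videos` option performs, using a plain ffmpeg stream copy. The helper, file names, and timestamps are illustrative assumptions, not the converter's actual implementation; only the InternVid field names come from the text above.

```python
import os
import subprocess


def cut_clip(sample: dict, out_dir: str = 'clips') -> str:
    """Cut the captioned span out of an already-downloaded video with ffmpeg."""
    os.makedirs(out_dir, exist_ok=True)
    src = sample['YoutubeID']  # field already replaced by the local video path
    start = sample['Start_timestamp']
    end = sample['End_timestamp']
    stem = os.path.splitext(os.path.basename(src))[0]
    tag = f'{start}_{end}'.replace(':', '-')
    dst = os.path.join(out_dir, f'{stem}_{tag}.mp4')
    # Stream copy is fast but cuts on keyframes, so boundaries are approximate.
    subprocess.run(
        ['ffmpeg', '-y', '-i', src, '-ss', start, '-to', end, '-c', 'copy', dst],
        check=True)
    return dst


clip = cut_clip({
    'YoutubeID': 'videos/example.mp4',
    'Start_timestamp': '00:00:03.500',
    'End_timestamp': '00:00:11.000',
    'Caption': 'a person walks along the beach at sunset',
})
```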
 #### MSR-VTT-like
 
 MSR-VTT dataset contains multiple fields, here we use 2 fields:
 
-- `video_id`: the video file name without suffix used in the sample.
+- `video_id`: the video file name without suffix used in the sample.
 We suppose that users have downloaded these videos already。
 - `caption`: the corresponding caption for the video.
diff --git a/tools/multimodal/README_ZH.md b/tools/multimodal/README_ZH.md
index dc3f50831..07afd10cb 100644
--- a/tools/multimodal/README_ZH.md
+++ b/tools/multimodal/README_ZH.md
@@ -163,7 +163,7 @@ python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --hel
             "id": "2219",
             "duration": 14.1424375,
             "audio": "wav_path",
-            "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]
+            "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]
 }
 
 # 转换后数据集
@@ -179,7 +179,7 @@ python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --hel
             "audio": "wav_path",
             "download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
             "category": "",
-            "tags": "" }]
+            "tags": "" }]
 }
 ```
 
@@ -211,4 +211,4 @@ InternVid数据集包括4个字段:
 #### 类MSR-VTT格式
 MSR-VTT数据集包含多个字段,主要用到2个字段:
 - `video_id`: 样本中使用的视频的文件名,未包含文件后缀。我们假设用户已经下载了这些视频。
-- `caption`: 与视频对应的caption。
\ No newline at end of file
+- `caption`: 与视频对应的caption。
diff --git a/tools/sandbox_starter.py b/tools/sandbox_starter.py
index 0bb30209f..ca3bb37d1 100644
--- a/tools/sandbox_starter.py
+++ b/tools/sandbox_starter.py
@@ -37,6 +37,11 @@ def init_sandbox_configs(args=None):
                         default='experiment1',
                         help='For wandb tracer name.')
 
+    parser.add_argument('--work_dir',
+                        type=str,
+                        default='./outputs/hello_world',
+                        help='Default output dir of meta information.')
+
     parser.add_argument(
         '--hpo_config',
         type=str,
@@ -90,19 +95,19 @@ def specify_jobs_configs(cfg):
     :param cfg: the original config
     :return: a dict of different configs.
     """
-    cfg.probe_job_configs = [
-        specify_job_configs(job_cfg) for job_cfg in cfg.probe_job_configs
-    ]
-    cfg.refine_recipe_job_configs = [
-        specify_job_configs(job_cfg)
-        for job_cfg in cfg.refine_recipe_job_configs
-    ]
-    cfg.execution_job_configs = [
-        specify_job_configs(job_cfg) for job_cfg in cfg.execution_job_configs
-    ]
-    cfg.evaluation_job_configs = [
-        specify_job_configs(job_cfg) for job_cfg in cfg.evaluation_job_configs
-    ]
+
+    def configs_to_job_list(cfgs):
+        job_cfgs = []
+        if cfgs:
+            job_cfgs = [specify_job_configs(job_cfg) for job_cfg in cfgs]
+        return job_cfgs
+
+    cfg.probe_job_configs = configs_to_job_list(cfg.probe_job_configs)
+    cfg.refine_recipe_job_configs = configs_to_job_list(
+        cfg.refine_recipe_job_configs)
+    cfg.execution_job_configs = configs_to_job_list(cfg.execution_job_configs)
+    cfg.evaluation_job_configs = configs_to_job_list(
+        cfg.evaluation_job_configs)
 
     return cfg
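Editor's note: the snippet below is a minimal standalone illustration of why the `specify_jobs_configs` refactor above matters: job sections that are missing from a sandbox config now simply become empty job lists instead of raising. `specify_job_configs` is stubbed out as an identity function and the sample job dict is arbitrary; neither reflects the real sandbox internals.

```python
def specify_job_configs(job_cfg):
    return job_cfg  # stub: the real function resolves per-job config files


def configs_to_job_list(cfgs):
    job_cfgs = []
    if cfgs:
        job_cfgs = [specify_job_configs(job_cfg) for job_cfg in cfgs]
    return job_cfgs


print(configs_to_job_list(None))  # [] -- section omitted from the sandbox config
print(configs_to_job_list([]))    # [] -- section present but left empty
print(configs_to_job_list([{'meta_name': 'probe_analysis'}]))  # one probe job
```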