+ add docs

modelscope · Sep 9, 2024 · 08e76c8 · 08e76c8
1 parent 1dd0fb8
commit 08e76c8
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 1 deletion.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -122,6 +122,8 @@ process:
       cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
       radius: 2                                               # radius of blur kernel
+  - image_tagging_mapper:                                   # Mapper to generate image tags.
+      tag_field_name: '__dj__image_tags__'                    # the field name to store the tags. It's "__dj__image_tags__" by default.
   - nlpaug_en_mapper:                                       # simply augment texts in English based on the nlpaug library
       sequential: false                                       # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
       aug_num: 1                                              # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
@@ -258,10 +260,12 @@ process:
       show_progress: false                                    # whether to show progress from scenedetect
   - video_tagging_from_audio_mapper:                        # Mapper to generate video tags from audio streams extracted from the video.
       hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593'       # Huggingface model name for the audio classification model.
+      tag_field_name: '__dj__video_audio_tags__'              # the field name to store the tags. It's "__dj__video_audio_tags__" by default.
       mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_tagging_from_frames_mapper:                       # Mapper to generate video tags from frames extracted from the video.
       frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts the specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      tag_field_name: '__dj__video_frame_tags__'              # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
   - whitespace_normalization_mapper:                        # normalize different kinds of whitespaces to English whitespace.
 
   # Filter ops
@@ -473,6 +477,7 @@ process:
       contain: any                                            # require the videos containing 'any' or 'all' given tags. When tags equal to [], 'all' keeps all samples, 'any' keeps no sample.
       frame_sampling_method: all_keyframes                    # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts the specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      tag_field_name: '__dj__video_frame_tags__'              # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
       any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
   - words_num_filter:                                       # filter text with number of words out of specific range
       lang: en                                                # sample in which language

diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py
@@ -35,6 +35,7 @@ def __init__(self,
                  contain: str = 'any',
                  frame_sampling_method: str = 'all_keyframes',
                  frame_num: PositiveInt = 3,
+                 tag_field_name=Fields.video_frame_tags,
                  any_or_all: str = 'any',
                  *args,
                  **kwargs):
@@ -59,6 +60,8 @@ def __init__(self,
             the first and the last frames will be extracted. If it's larger
             than 2, in addition to the first and the last frames, other frames
             will be extracted uniformly within the video duration.
+        :param tag_field_name: the field name to store the tags. It's
+            "__dj__video_frame_tags__" by default.
         :param any_or_all: keep this sample with 'any' or 'all' strategy of
             all videos. 'any': keep this sample if any videos meet the
             condition. 'all': keep this sample only if all videos meet the
@@ -80,10 +83,12 @@ def __init__(self,
         self.tags = set([tag.lower() for tag in tags])
         self.contain_any = (contain == 'any')
         self.any = (any_or_all == 'any')
+        self.tag_field_name = tag_field_name
         self.tagging_producer = VideoTaggingFromFramesMapper(
             frame_sampling_method=frame_sampling_method,
             frame_num=frame_num,
             accelerator=self.accelerator,
+            tag_field_name=self.tag_field_name,
         )
 
     def compute_stats(self, sample, rank=None, context=False):
@@ -93,7 +98,7 @@ def compute_stats(self, sample, rank=None, context=False):
         return sample
 
     def process(self, sample, rank=None):
-        video_tags = sample[Fields.video_frame_tags]
+        video_tags = sample[self.tag_field_name]
         if len(video_tags) <= 0:
             return True
 

diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
@@ -55,6 +55,8 @@ def __init__(self,
             the first and the last frames will be extracted. If it's larger
             than 2, in addition to the first and the last frames, other frames
             will be extracted uniformly within the video duration.
+        :param tag_field_name: the field name to store the tags. It's
+            "__dj__video_frame_tags__" by default.
         :param args: extra args
         :param kwargs: extra args
         """