Link vtt file to CaptionFile model

learningequality · Oct 17, 2023 · e982829 · e982829
1 parent bd831b8
commit e982829
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 43 deletions.
diff --git a/contentcuration/contentcuration/migrations/0145_captioncue_captionfile.py b/contentcuration/contentcuration/migrations/0145_captioncue_captionfile.py
@@ -1,4 +1,4 @@
-# Generated by Django 3.2.14 on 2023-09-23 08:23
+# Generated by Django 3.2.14 on 2023-10-17 06:55
 
 import contentcuration.models
 from django.db import migrations, models
@@ -18,7 +18,9 @@ class Migration(migrations.Migration):
             fields=[
                 ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)),
                 ('file_id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32)),
+                ('modified', models.DateTimeField(auto_now=True, verbose_name='modified')),
                 ('language', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='caption_file', to='contentcuration.language')),
+                ('output_file', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='contentcuration.file')),
             ],
             options={
                 'unique_together': {('file_id', 'language')},

diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py
@@ -2065,12 +2065,16 @@ class CaptionFile(models.Model):
     """
     Represents a caption file record.
 
-    - file_id: The identifier of related file in Google Cloud Storage.
+    - file_id: The identifier of related Video/Audio File object.
     - language: The language of the caption file.
+    - output_file: The FK to the associated generated VTT File object.
     """
     id = UUIDField(primary_key=True, default=uuid.uuid4)
     file_id = UUIDField(default=uuid.uuid4, max_length=36)
     language = models.ForeignKey(Language, related_name="caption_file", on_delete=models.CASCADE)
+    modified = models.DateTimeField(auto_now=True, verbose_name="modified")
+    output_file = models.ForeignKey('File', null=True, blank=True,
+                                    on_delete=models.SET_NULL)
 
     class Meta:
         unique_together = ['file_id', 'language']

diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py
@@ -168,15 +168,59 @@ def create_kolibri_license_object(ccnode):
         license_description=ccnode.license.license_description if use_license_description else ccnode.license_description
     )
 
+def create_webvtt_file(ccnode: ccmodels.ContentNode,
+                       vtt_content: bytes,
+                       caption_file: ccmodels.CaptionFile,
+                       user_id: int = None) -> None:
+    """Create a WebVTT File for a node and associate it with a CaptionFile.
+
+    :param ccnode: The ContentNode associated with the WebVTT file.
+    :param vtt_content: The WebVTT content as UTF-8 bytes (a str is encoded as UTF-8).
+    :param caption_file: The CaptionFile whose output_file is set to the new File.
+    :param user_id: The ID of the user creating the WebVTT file (optional).
+    """
+    logging.debug(f"Creating WebVTT for Node {ccnode.title}")
+    filename = "{0}_{1}.{ext}".format(ccnode.title, caption_file.language, ext=file_formats.VTT)
+    if isinstance(vtt_content, str):
+        vtt_content = vtt_content.encode('utf-8')  # the temp file is written in binary mode
+    temppath = None
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".vtt", delete=False) as tempf:
+            temppath = tempf.name
+            tempf.write(vtt_content)
+        with open(temppath, 'rb') as vtt_fobj:  # closed on exit, unlike a bare open()
+            vtt_file_obj = ccmodels.File.objects.create(
+                file_on_disk=File(vtt_fobj, name=filename),
+                contentnode=ccnode,
+                file_format_id=file_formats.VTT,
+                preset_id=format_presets.VIDEO_SUBTITLE,
+                original_filename=filename,
+                file_size=len(vtt_content),
+                uploaded_by_id=user_id,
+                language=caption_file.language,
+            )
+        logging.debug("Created VTT for {0} with checksum {1}".format(ccnode.title, vtt_file_obj.checksum))
+        caption_file.output_file = vtt_file_obj
+        caption_file.save()
+    except Exception as e:  # errors are logged and swallowed; nothing is raised to the caller
+        logging.error(f"Error creating VTT file for {ccnode.title}: {str(e)}")
+    finally:
+        if temppath is not None:
+            os.unlink(temppath)
+
 def generate_webvtt_file(caption_cues: QuerySet[ccmodels.CaptionCue]) -> str:
-    """:returns: webvtt_content as string"""
+    """ Generate the content of a WebVTT file based on CaptionCue's.
+
+    :param: caption_cues: QuerySet of CaptionCues to include in the WebVTT.
+    :returns: The WebVTT content as a UTF-8 encoded string.
+    """
     webvtt_content = "WEBVTT\n\n"
     for cue in caption_cues.order_by('starttime'):
         st = str(timedelta(seconds=cue.starttime))
         et = str(timedelta(seconds=cue.endtime))
         webvtt_content += f"{st}.000 --> {et}.000\n"
         webvtt_content += f"- {cue.text}\n\n"
-    return webvtt_content
+    return webvtt_content.encode('utf-8')
 
 def increment_channel_version(channel):
     channel.version += 1
@@ -276,13 +320,14 @@ def recurse_nodes(self, node, inherited_fields):  # noqa C901
             elif node.kind_id == content_kinds.SLIDESHOW:
                 create_slideshow_manifest(node, user_id=self.user_id)
             elif node.kind_id in [content_kinds.AUDIO, content_kinds.VIDEO]:
-                file_ids = node.files.all().values_list('id')
-                caption_files_queryset = ccmodels.CaptionFile.objects.filter(file_id__in=file_ids)
-                for caption_file in caption_files_queryset:
-                    caption_cues = ccmodels.CaptionCue.objects.filter(caption_file=caption_file)
-                    if caption_cues.exists():
-                        lang, webvtt_content = caption_file.language, generate_webvtt_file(caption_cues)
-                        create_webvtt(node, webvtt_content, lang, self.user_id)
+                if node.changed:
+                    file_ids = node.files.all().values_list('id')
+                    caption_files = ccmodels.CaptionFile.objects.filter(file_id__in=file_ids)
+                    for cf in caption_files:
+                        vtt_file = cf.output_file
+                        vtt_content = generate_webvtt_file(caption_cues=cf.caption_cue.all())
+                        if vtt_file is None or vtt_file.modified < cf.modified:
+                            create_webvtt_file(node, vtt_content, cf, self.user_id)
             elif node.kind_id == content_kinds.TOPIC:
                 for child in node.children.all():
                     self.recurse_nodes(child, metadata)
@@ -492,31 +537,6 @@ def create_associated_file_objects(kolibrinode, ccnode):
             local_file=kolibrilocalfilemodel,
         )
 
-def create_webvtt(ccnode: ccmodels.ContentNode, vtt_content: str, language: str, user_id: int = None) -> None:
-    logging.debug(f"Creating WebVTT for Node {ccnode.title}")
-    filename = "{0}_{1}.{ext}".format(ccnode.title, language, ext=file_formats.VTT)
-    temppath = None
-    try:
-        with tempfile.NamedTemporaryFile(suffix="vtt", delete=False) as tempf:
-            temppath = tempf.name
-            tempf.write(vtt_content.encode('utf-8'))
-            file_size = tempf.tell()
-            tempf.flush()
-
-            file_obj = ccmodels.File.objects.create(
-                file_on_disk=File(open(temppath, 'rb'), name=filename),
-                contentnode=ccnode,
-                file_format_id=file_formats.VTT,
-                preset_id=format_presets.VIDEO_SUBTITLE,
-                original_filename=filename,
-                file_size=file_size,
-                uploaded_by_id=user_id,
-                language=language,
-            )
-            logging.debug("Created vtt for {0} with checksum {1}".format(ccnode.title, file_obj.checksum))
-    finally:
-        temppath and os.unlink(temppath)
-
 def create_perseus_exercise(ccnode, kolibrinode, exercise_data, user_id=None):
     logging.debug("Creating Perseus Exercise for Node {}".format(ccnode.title))
     filename = "{0}.{ext}".format(ccnode.title, ext=file_formats.PERSEUS)

diff --git a/requirements-dev.in b/requirements-dev.in
@@ -26,3 +26,4 @@ git+https://github.com/someshchaturvedi/customizable-django-profiler.git#customi
 tabulate==0.8.2
 fonttools
 minio==7.1.1
+transformers==4.29.2
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,6 +1,6 @@
 #
-# This file is autogenerated by pip-compile with python 3.9
-# To update, run:
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
 #
 #    pip-compile requirements-dev.in
 #
@@ -77,8 +77,12 @@ drf-yasg==1.20.0
     # via -r requirements-dev.in
 faker==0.9.1
     # via mixer
-filelock==3.4.1
-    # via virtualenv
+filelock==3.12.4
+    # via
+    #   -c requirements.txt
+    #   huggingface-hub
+    #   transformers
+    #   virtualenv
 flake8==3.4.1
     # via -r requirements-dev.in
 flask==2.0.3
@@ -92,6 +96,10 @@ flask-cors==3.0.10
     # via locust
 fonttools==4.40.0
     # via -r requirements-dev.in
+fsspec==2023.9.2
+    # via
+    #   -c requirements.txt
+    #   huggingface-hub
 gevent==21.12.0
     # via
     #   geventhttpclient
@@ -100,6 +108,10 @@ geventhttpclient==2.0.9
     # via locust
 greenlet==1.1.2
     # via gevent
+huggingface-hub==0.17.3
+    # via
+    #   -c requirements.txt
+    #   transformers
 identify==2.4.4
     # via pre-commit
 idna==2.10
@@ -144,12 +156,18 @@ nodeenv==1.6.0
     # via
     #   -r requirements-dev.in
     #   pre-commit
+numpy==1.26.0
+    # via
+    #   -c requirements.txt
+    #   transformers
 packaging==20.9
     # via
     #   -c requirements.txt
     #   build
     #   drf-yasg
+    #   huggingface-hub
     #   pytest
+    #   transformers
 pep517==0.12.0
     # via build
 pip-tools==6.8.0
@@ -209,17 +227,26 @@ pytz==2022.1
     # via
     #   -c requirements.txt
     #   django
-pyyaml==6.0
+pyyaml==6.0.1
     # via
+    #   -c requirements.txt
     #   aspy-yaml
+    #   huggingface-hub
     #   pre-commit
+    #   transformers
 pyzmq==23.1.0
     # via locust
+regex==2023.10.3
+    # via
+    #   -c requirements.txt
+    #   transformers
 requests==2.25.1
     # via
     #   -c requirements.txt
     #   coreapi
+    #   huggingface-hub
     #   locust
+    #   transformers
 roundrobin==0.0.2
     # via locust
 ruamel-yaml==0.17.21
@@ -247,6 +274,10 @@ tblib==1.7.0
     # via django-concurrent-test-helper
 text-unidecode==1.2
     # via faker
+tokenizers==0.13.3
+    # via
+    #   -c requirements.txt
+    #   transformers
 toml==0.10.2
     # via
     #   pre-commit
@@ -256,8 +287,20 @@ tomli==1.2.3
     #   build
     #   coverage
     #   pep517
-typing-extensions==4.5.0
-    # via locust
+tqdm==4.66.1
+    # via
+    #   -c requirements.txt
+    #   huggingface-hub
+    #   transformers
+transformers==4.29.2
+    # via
+    #   -c requirements.txt
+    #   -r requirements-dev.in
+typing-extensions==4.8.0
+    # via
+    #   -c requirements.txt
+    #   huggingface-hub
+    #   locust
 uritemplate==3.0.1
     # via
     #   coreapi