From 1866eb71945f86012aea5e57858dad995cfba67e Mon Sep 17 00:00:00 2001
From: Calvin Walton <calvin.walton@kepstin.ca>
Date: Fri, 25 Sep 2020 12:36:24 -0400
Subject: [PATCH] Perform captions generation in UTF-16 encoding

The indexes returned in recording events from BBB refer to positions
within a UTF-16 encoded string. Rather than attempt to untangle this in
the server (which might have a performance cost), it's easier to switch
the caption processing code to operate in UTF-16 encoding as well to
make it work consistently.

The PyICU library provides a UnicodeString type, which is a UTF-16 string
similar to those in Java and JavaScript, but which supports all the Python
indexing methods. It's fairly straightforward to swap it in place of
the types used previously, and it works natively as an input to the ICU
line break iterator too.

Fixes #10531
---
 .../core/scripts/utils/gen_webvtt             | 207 ++++++++++--------
 1 file changed, 113 insertions(+), 94 deletions(-)

diff --git a/record-and-playback/core/scripts/utils/gen_webvtt b/record-and-playback/core/scripts/utils/gen_webvtt
index 0221a4ccb5..d9ab086ffa 100755
--- a/record-and-playback/core/scripts/utils/gen_webvtt
+++ b/record-and-playback/core/scripts/utils/gen_webvtt
@@ -22,7 +22,7 @@ from lxml import etree
 from collections import deque
 from fractions import Fraction
 import io
-from icu import Locale, BreakIterator
+from icu import Locale, BreakIterator, UnicodeString
 import unicodedata
 import html
 import logging
@@ -35,12 +35,14 @@ logging.basicConfig(level=logging.DEBUG)
 
 logger = logging.getLogger(__name__)
 
+
 def webvtt_timestamp(ms):
     frac_s = int(ms % 1000)
     s = int(ms / 1000 % 60)
     m = int(ms / 1000 / 60 % 60)
     h = int(ms / 1000 / 60 / 60)
-    return '{:02}:{:02}:{:02}.{:03}'.format(h, m, s, frac_s)
+    return "{:02}:{:02}:{:02}.{:03}".format(h, m, s, frac_s)
+
 
 class CaptionLine:
     def __init__(self):
@@ -48,10 +50,11 @@ class CaptionLine:
         self.start_time = 0
         self.end_time = 0
 
+
 class Caption:
     def __init__(self, locale):
         self.locale = locale
-        self.text = list()
+        self.text = UnicodeString()
         self.timestamps = list()
         self._del_timestamps = list()
 
@@ -63,24 +66,30 @@ class Caption:
             else:
                 del_timestamp = self.timestamps[i]
             self._del_timestamps[i] = del_timestamp
-            logger.debug("Removing text %s at %d:%d, del_ts: %d",
-                    repr(''.join(self.text[i:j])), i, j, del_timestamp)
+            logger.debug(
+                "Removing text %s at %d:%d, del_ts: %d",
+                repr(str(self.text[i:j])),
+                i,
+                j,
+                del_timestamp,
+            )
 
         if len(text) > 0:
-            logger.debug("Inserting text %s at %d:%d, ts: %d",
-                    repr(''.join(text)), i, j, timestamp)
+            logger.debug(
+                "Inserting text %s at %d:%d, ts: %d", repr(str(text)), i, j, timestamp
+            )
 
             if i < len(self.timestamps) and timestamp > self.timestamps[i]:
                 timestamp = self._del_timestamps[i]
                 if timestamp is None:
                     if i > 0:
-                        timestamp = self.timestamps[i-1]
+                        timestamp = self.timestamps[i - 1]
                     else:
                         timestamp = self.timestamps[i]
                 logger.debug("Out of order timestamps, using ts: %d", timestamp)
 
         self._del_timestamps[i:j] = [del_timestamp] * len(text)
-        if (i < len(self._del_timestamps)):
+        if i < len(self._del_timestamps):
             self._del_timestamps[i] = del_timestamp
 
         self.text[i:j] = text
@@ -94,9 +103,9 @@ class Caption:
         stop_pos = 0
         start_pos = None
         for event in events:
-            if event['name'] == 'record_status':
-                status = event['status']
-                timestamp = event['timestamp']
+            if event["name"] == "record_status":
+                status = event["status"]
+                timestamp = event["timestamp"]
 
                 if status and not record:
                     record = True
@@ -106,13 +115,14 @@ class Caption:
                     # Find the position of the first character after recording
                     # started
                     start_pos = stop_pos
-                    while start_pos < len(self.timestamps) and \
-                            self.timestamps[start_pos] < start_ts:
+                    while (
+                        start_pos < len(self.timestamps)
+                        and self.timestamps[start_pos] < start_ts
+                    ):
                         start_pos += 1
 
-                    logger.debug("Replacing characters %d:%d",
-                            stop_pos, start_pos)
-                    self.text[stop_pos:start_pos] = ["\n"]
+                    logger.debug("Replacing characters %d:%d", stop_pos, start_pos)
+                    self.text[stop_pos:start_pos] = "\n"
                     self.timestamps[stop_pos:start_pos] = [stop_ts - ts_offset]
 
                     start_pos = stop_pos + 1
@@ -130,8 +140,10 @@ class Caption:
                     # Find the position of the first character after recording
                     # stopped, and apply ts offsets
                     stop_pos = start_pos
-                    while stop_pos < len(self.timestamps) and \
-                            self.timestamps[stop_pos] < stop_ts:
+                    while (
+                        stop_pos < len(self.timestamps)
+                        and self.timestamps[stop_pos] < stop_ts
+                    ):
                         self.timestamps[stop_pos] -= ts_offset
                         stop_pos += 1
 
@@ -149,17 +161,16 @@ class Caption:
         # Apply all of the caption events to generate the full text
         # with per-character timestamps
         for event in events:
-            if event['name'] == 'edit_caption_history':
-                locale = event['locale']
-                i = event['start_index']
-                j = event['end_index']
-                timestamp = event['timestamp']
-                text = event['text']
+            if event["name"] == "edit_caption_history":
+                locale = event["locale"]
+                i = event["start_index"]
+                j = event["end_index"]
+                timestamp = event["timestamp"]
+                text = UnicodeString(event["text"])
 
                 caption = captions.get(locale)
                 if caption is None:
-                    logger.info("Started caption stream for locale '%s'",
-                            locale)
+                    logger.info("Started caption stream for locale '%s'", locale)
                     captions[locale] = caption = cls(locale)
 
                 caption.apply_edit(i, j, timestamp, text)
@@ -175,15 +186,12 @@ class Caption:
     def split_lines(self, max_length=32):
         lines = list()
 
-        str_text = "".join(self.text)
-
         locale = Locale(self.locale)
-        logger.debug("Using locale %s for word-wrapping",
-                locale.getDisplayName(locale))
+        logger.debug("Using locale %s for word-wrapping", locale.getDisplayName(locale))
 
         break_iter = BreakIterator.createLineInstance(locale)
-        break_iter.setText(str_text)
-        
+        break_iter.setText(self.text)
+
         line = CaptionLine()
         line_start = 0
         prev_break = 0
@@ -194,39 +202,45 @@ class Caption:
             status = break_iter.getRuleStatus()
 
             line_end = next_break
-            while line_end > line_start and ( \
-                    self.text[line_end-1].isspace() or \
-                    unicodedata.category(self.text[line_end-1]) in ['Cc', 'Mn']
-                    ):
+            logger.debug("text len: %d, line end: %d", len(self.text), line_end)
+            while line_end > line_start and (
+                self.text[line_end - 1].isspace()
+                or unicodedata.category(self.text[line_end - 1]) in ["Cc", "Mn"]
+            ):
                 line_end -= 1
 
             do_break = False
             text_section = unicodedata.normalize(
-                    'NFC', "".join(self.text[line_start:line_end]))
+                "NFC", str(self.text[line_start:line_end])
+            )
             timestamps_section = self.timestamps[line_start:next_break]
             start_time = min(timestamps_section)
             end_time = max(timestamps_section)
             if len(text_section) > max_length:
                 if prev_break == line_start:
                     # Over-long string. Just chop it into bits
-                    line_end = next_break = prev_break + max_length
+                    next_break = prev_break + max_length
+                    continue
                 else:
                     next_break = prev_break
                     do_break = True
 
             else:
                 # Status [100,200) indicates a required (hard) line break
-                if next_break >= len(self.text) or \
-                        (status >= 100 and status < 200):
+                if next_break >= len(self.text) or (status >= 100 and status < 200):
                     line.text = text_section
                     line.start_time = start_time
                     line.end_time = end_time
                     do_break = True
 
             if do_break:
-                logger.debug("text section %d -> %d (%d): %s",
-                        line.start_time, line.end_time,
-                        len(line.text), repr(line.text))
+                logger.debug(
+                    "text section %d -> %d (%d): %s",
+                    line.start_time,
+                    line.end_time,
+                    len(line.text),
+                    repr(line.text),
+                )
                 lines.append(line)
                 line = CaptionLine()
                 line_start = next_break
@@ -242,7 +256,7 @@ class Caption:
 
     def write_webvtt(self, f):
         # Write magic
-        f.write("WEBVTT\n\n".encode('utf-8'))
+        f.write("WEBVTT\n\n".encode("utf-8"))
 
         lines = self.split_lines()
 
@@ -297,49 +311,48 @@ class Caption:
                 if next_start_time - end_time < 500:
                     end_time = next_start_time
 
-            f.write("{} --> {}\n".format(
-                    webvtt_timestamp(start_time),
-                    webvtt_timestamp(end_time)
-                    ).encode('utf-8'))
-            f.write(html.escape(text, quote=False).encode('utf-8'))
-            f.write("\n\n".encode('utf-8'))
+            f.write(
+                "{} --> {}\n".format(
+                    webvtt_timestamp(start_time), webvtt_timestamp(end_time)
+                ).encode("utf-8")
+            )
+            f.write(html.escape(text, quote=False).encode("utf-8"))
+            f.write("\n\n".encode("utf-8"))
 
     def caption_desc(self):
         locale = Locale(self.locale)
-        return {
-                "locale": self.locale,
-                "localeName": locale.getDisplayName(locale)
-                }
+        return {"locale": self.locale, "localeName": locale.getDisplayName(locale)}
 
 
 def parse_record_status(event, element):
-    userId = element.find('userId')
-    status = element.find('status')
+    userId = element.find("userId")
+    status = element.find("status")
+
+    event["name"] = "record_status"
+    event["user_id"] = userId.text
+    event["status"] = status.text == "true"
 
-    event['name'] = 'record_status'
-    event['user_id'] = userId.text
-    event['status'] = (status.text == 'true')
 
 def parse_caption_edit(event, element):
-    locale = element.find('locale')
-    text = element.find('text')
-    startIndex = element.find('startIndex')
-    endIndex = element.find('endIndex')
-    localeCode = element.find('localeCode')
-
-    event['name'] = 'edit_caption_history'
-    event['locale_name'] = locale.text
+    locale = element.find("locale")
+    text = element.find("text")
+    startIndex = element.find("startIndex")
+    endIndex = element.find("endIndex")
+    localeCode = element.find("localeCode")
+
+    event["name"] = "edit_caption_history"
+    event["locale_name"] = locale.text
     if localeCode is not None:
-        event['locale'] = localeCode.text
+        event["locale"] = localeCode.text
     else:
         # Fallback for missing 'localeCode'
-        event['locale'] = "en"
+        event["locale"] = "en"
     if text.text is None:
-        event['text'] = list()
+        event["text"] = ""
     else:
-        event['text'] = list(text.text)
-    event['start_index'] = int(startIndex.text)
-    event['end_index'] = int(endIndex.text)
+        event["text"] = text.text
+    event["start_index"] = int(startIndex.text)
+    event["end_index"] = int(endIndex.text)
 
 
 def parse_events(directory="."):
@@ -353,22 +366,22 @@ def parse_events(directory="."):
             event = {}
 
             # Convert timestamps to be in seconds from recording start
-            timestamp = int(element.attrib['timestamp'])
+            timestamp = int(element.attrib["timestamp"])
             if not start_time:
                 start_time = timestamp
             timestamp = timestamp - start_time
 
             # Only need events from these modules
-            if not element.attrib['module'] in ['CAPTION','PARTICIPANT']:
+            if not element.attrib["module"] in ["CAPTION", "PARTICIPANT"]:
                 continue
 
-            event['name'] = name = element.attrib['eventname']
-            event['timestamp'] = timestamp
+            event["name"] = name = element.attrib["eventname"]
+            event["timestamp"] = timestamp
 
-            if name == 'RecordStatusEvent':
+            if name == "RecordStatusEvent":
                 parse_record_status(event, element)
                 have_record_events = True
-            elif name == 'EditCaptionHistoryEvent':
+            elif name == "EditCaptionHistoryEvent":
                 parse_caption_edit(event, element)
             else:
                 logger.debug("Unhandled event: %s", name)
@@ -381,25 +394,31 @@ def parse_events(directory="."):
     if not have_record_events:
         # Add a fake record start event to the events list
         event = {
-                'name': 'record_status',
-                'user_id': None,
-                'timestamp': 0,
-                'status': True
-                }
+            "name": "record_status",
+            "user_id": None,
+            "timestamp": 0,
+            "status": True,
+        }
         events.appendleft(event)
 
     return events
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-            description="Generate WebVTT files from BigBlueButton captions",
-            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-i", "--input", metavar="PATH",
-            help="input directory with events.xml file",
-            default=os.curdir)
-    parser.add_argument("-o", "--output", metavar="PATH",
-            help="output directory",
-            default=os.curdir)
+        description="Generate WebVTT files from BigBlueButton captions",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        metavar="PATH",
+        help="input directory with events.xml file",
+        default=os.curdir,
+    )
+    parser.add_argument(
+        "-o", "--output", metavar="PATH", help="output directory", default=os.curdir
+    )
     args = parser.parse_args()
 
     rawdir = args.input
@@ -419,6 +438,6 @@ if __name__ == "__main__":
     filename = os.path.join(outputdir, "captions.json")
     logger.info("Writing captions index file to %s", filename)
 
-    caption_descs = [ caption.caption_desc() for caption in captions.values() ]
+    caption_descs = [caption.caption_desc() for caption in captions.values()]
     with open(filename, "w") as f:
         json.dump(caption_descs, f)
-- 
GitLab