Commit 8c2349b

unique_track renamed one_token_stream, convert_sequence_to_tokseq method, io_format property
Natooz committed Jul 24, 2023
1 parent 0eef074 commit 8c2349b
Showing 13 changed files with 171 additions and 99 deletions.
18 changes: 12 additions & 6 deletions docs/midi_tokenizer.rst
@@ -144,16 +144,16 @@ Tokens & TokSequence input / output format

Depending on the tokenizer in use, the **format** of the tokens returned by the ``midi_to_tokens`` method may vary, as will the format expected by the ``tokens_to_midi`` method. For any given tokenizer, the format is the same for both methods.

The format is deduced from the ``is_multi_voc`` and ``unique_track`` tokenizer properties. In short: **unique_track** being True means that the tokenizer will convert a MIDI file into a single stream of tokens for all instrument tracks, otherwise it will convert each track to a distinct token stream; **is_multi_voc** being True means that each token stream is a list of lists of tokens, of shape ``(T,C)`` for T time steps and C subtokens per time step.
The format is deduced from the ``is_multi_voc`` and ``one_token_stream`` tokenizer properties. In short: **one_token_stream** being True means that the tokenizer will convert a MIDI file into a single stream of tokens for all instrument tracks, otherwise it will convert each track to a distinct token stream; **is_multi_voc** being True means that each token stream is a list of lists of tokens, of shape ``(T,C)`` for T time steps and C subtokens per time step.

This results in four situations, where I is the number of tracks, T is the number of tokens (or time steps) and C the number of subtokens per time step:

* **is_multi_voc** and **unique_track** are both **False**: ``[I,(T)]``
* **is_multi_voc** is **False** and **unique_track** is **True**: ``(T)``
* **is_multi_voc** is **True** and **unique_track** is **False**: ``[I,(T,C)]``
* **is_multi_voc** and **unique_track** are both **True**: ``(T,C)``
* **is_multi_voc** and **one_token_stream** are both **False**: ``[I,(T)]``
* **is_multi_voc** is **False** and **one_token_stream** is **True**: ``(T)``
* **is_multi_voc** is **True** and **one_token_stream** is **False**: ``[I,(T,C)]``
* **is_multi_voc** and **one_token_stream** are both **True**: ``(T,C)``

**Note that if there is no I dimension in the format, the output of ``midi_to_tokens`` is a ``TokSequence`` object, otherwise it is a list of ``TokSequence`` objects (one per token stream / track).**
**Note that if there is no I dimension in the format, the output of **``midi_to_tokens``** is a **:class:`miditok.TokSequence`** object, otherwise it is a list of **:class:`miditok.TokSequence`** objects (one per token stream / track).**

Some tokenizer examples to illustrate:

@@ -163,6 +163,12 @@ Some tokenizer examples to illustrate:
* **Octuple** is a multi-voc tokenizer and converts all MIDI tracks to a single stream of tokens, hence it will convert MIDI files to a single ``TokSequence`` object, in ``(T,C)`` format.
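To make the four formats concrete, here is a minimal sketch (the file path is hypothetical, and the tokenizers are assumed to use default configurations)::

    from miditok import REMI, Octuple, TokenizerConfig
    from miditoolkit import MidiFile

    midi = MidiFile("path/to/file.mid")  # hypothetical path

    remi = REMI(TokenizerConfig())  # is_multi_voc=False, one_token_stream=False
    tokens = remi.midi_to_tokens(midi)  # list of TokSequence, one per track: [I,(T)]

    octuple = Octuple(TokenizerConfig())  # is_multi_voc=True, one_token_stream=True
    tokens = octuple.midi_to_tokens(midi)  # a single TokSequence: (T,C)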


**You can use the **``convert_sequence_to_tokseq``** function to automatically convert an input sequence, of ids (integers) or tokens (strings), into a **:class:`miditok.TokSequence`** or list of **:class:`miditok.TokSequence`** objects with the appropriate format for the tokenizer being used.**

.. autofunction:: miditok.convert_sequence_to_tokseq
    :noindex:
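
A minimal usage sketch (the ids below are made up for illustration)::

    from miditok import REMI, TokenizerConfig, convert_sequence_to_tokseq

    tokenizer = REMI(TokenizerConfig())
    ids = [[5, 72, 130, 142], [5, 65, 124, 138]]  # hypothetical ids, one list per track
    seqs = convert_sequence_to_tokseq(tokenizer, ids, complete_seq=False)  # skip completion for toy ids
    # seqs is a list of TokSequence objects, matching the tokenizer's ("I", "T") format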


Magic methods
------------------------

2 changes: 1 addition & 1 deletion miditok/__init__.py
@@ -10,7 +10,7 @@
    MuMIDI,
    MMM,
)
from .midi_tokenizer import MIDITokenizer
from .midi_tokenizer import MIDITokenizer, convert_sequence_to_tokseq
from .classes import Event, TokSequence, TokenizerConfig

from .utils import utils
14 changes: 7 additions & 7 deletions miditok/data_augmentation/data_augmentation.py
@@ -72,7 +72,7 @@ def data_augmentation_dataset(
        file = json.load(json_file)
        ids, programs = file["ids"], file["programs"]

        if tokenizer.unique_track:
        if tokenizer.one_token_stream:
            ids = [ids]

        # Perform data augmentation for each track
@@ -91,9 +91,9 @@
        ] = {}
        for track, (_, is_drum) in zip(ids, programs):
            # we don't augment drums
            if not tokenizer.unique_track and is_drum:
            if not tokenizer.one_token_stream and is_drum:
                continue
            elif tokenizer.unique_track and all(p[1] for p in programs):
            elif tokenizer.one_token_stream and all(p[1] for p in programs):
                continue
            corrected_offsets = deepcopy(offsets)
            vel_dim = int(128 / len(tokenizer.velocities))
@@ -109,14 +109,14 @@
            if len(aug) == 0:
                continue
            for aug_offsets, seq in aug:
                if tokenizer.unique_track:
                if tokenizer.one_token_stream:
                    augmented_tokens[aug_offsets] = seq
                    continue
                try:
                    augmented_tokens[aug_offsets].append(seq)
                except KeyError:
                    augmented_tokens[aug_offsets] = [seq]
        if not tokenizer.unique_track:
        if not tokenizer.one_token_stream:
            for i, (track, (_, is_drum)) in enumerate(
                zip(ids, programs)
            ):  # adding drums to all already augmented
@@ -142,7 +142,7 @@
                nb_augmentations += 1
                nb_tracks_augmented += len(tracks_seq)
        if copy_original_in_new_location and out_path is not None:
            if tokenizer.unique_track:
            if tokenizer.one_token_stream:
                ids = ids[0]
            tokenizer.save_tokens(
                ids, out_path / f"{file_path.stem}.json", programs
@@ -455,7 +455,7 @@ def data_augmentation_tokens(
    note_off_tokens = np.array(tokenizer.token_ids_of_type("NoteOff"))
    mask_pitch = np.isin(tokens, pitch_tokens)
    # If applicable, removes drum notes from the mask
    if tokenizer.unique_track:
    if tokenizer.one_token_stream:
        for idx, is_note in enumerate(mask_pitch):
            if (
                is_note
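The pattern handled throughout this module: a one-token-stream tokenizer saves a single sequence of ids, which the augmentation code wraps in a list so the same per-track loops apply. A minimal sketch of that normalization (the file name is hypothetical, and `tokenizer` is assumed to be already loaded):

    import json

    with open("tokens.json") as json_file:  # hypothetical tokenized file
        file = json.load(json_file)
    ids, programs = file["ids"], file["programs"]

    if tokenizer.one_token_stream:
        ids = [ids]  # wrap the single stream so it iterates like a list of tracks
    for track in ids:
        ...  # apply pitch / velocity / duration offsets per track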
184 changes: 125 additions & 59 deletions miditok/midi_tokenizer.py
@@ -32,6 +32,83 @@
)


def convert_sequence_to_tokseq(
    tokenizer,
    input_seq,
    complete_seq: bool = True,
    decode_bpe: bool = True,
) -> Union[TokSequence, List[TokSequence]]:
    r"""Converts a sequence into a :class:`miditok.TokSequence` or list of :class:`miditok.TokSequence`
    objects with the appropriate format of the tokenizer being used.

    :param tokenizer: tokenizer being used with the sequence.
    :param input_seq: sequence to convert. It can be a list of ids (integers), tokens (strings) or events
        (Event). It can also be a PyTorch or TensorFlow tensor, or a Numpy array representing ids.
    :param complete_seq: will complete the output sequence(s). (default: True)
    :param decode_bpe: if the input sequence contains ids holding BPE tokens, they will be decoded.
        (default: True)
    :return: a :class:`miditok.TokSequence` object, or a list of :class:`miditok.TokSequence` objects
        if the tokenizer converts each track to a distinct token stream.
    """
    # Deduce the type of data (ids/tokens/events)
    try:
        arg = ("ids", convert_ids_tensors_to_list(input_seq))
    except (AttributeError, ValueError, TypeError, IndexError):
        if isinstance(input_seq[0], str) or (
            isinstance(input_seq[0], list) and isinstance(input_seq[0][0], str)
        ):
            arg = ("tokens", input_seq)
        else:  # list of Event, but unlikely
            arg = ("events", input_seq)

    # Deduce nb of subscripts / dims
    nb_io_dims = len(tokenizer.io_format)
    nb_seq_dims = 1
    if isinstance(arg[1][0], list):
        nb_seq_dims += 1
        if isinstance(arg[1][0][0], list):
            nb_seq_dims += 1

    # Check the number of dimensions is good
    # In case of no one_token_stream and one dimension short --> unsqueeze
    if not tokenizer.one_token_stream and nb_seq_dims == nb_io_dims - 1:
        print(
            f"The input sequence has one dimension less than expected ({nb_seq_dims} instead of "
            f"{nb_io_dims}). It is being unsqueezed to conform with the tokenizer's i/o format "
            f"({tokenizer.io_format})"
        )
        arg = (arg[0], [arg[1]])
        nb_seq_dims += 1  # keep the dimension count consistent for the checks below

    elif nb_seq_dims != nb_io_dims:
        raise ValueError(
            f"The input sequence does not have the expected dimension "
            f"({nb_seq_dims} instead of {nb_io_dims})."
        )

    # Convert to TokSequence
    if not tokenizer.one_token_stream and nb_io_dims == nb_seq_dims:
        seq = []
        for obj in arg[1]:
            kwarg = {arg[0]: obj}
            seq.append(TokSequence(**kwarg))
            if not tokenizer.is_multi_voc:
                seq[-1].ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(
                    seq[-1].ids
                )
    else:  # 1 subscript, one_token_stream and no multi-voc
        kwarg = {arg[0]: arg[1]}
        seq = TokSequence(**kwarg)
        if not tokenizer.is_multi_voc:
            seq.ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(seq.ids)

    # Decode BPE and complete the output sequence(s) if requested
    if tokenizer.has_bpe and decode_bpe:
        tokenizer.decode_bpe(seq)
    if complete_seq:
        if isinstance(seq, TokSequence):
            tokenizer.complete_sequence(seq)
        else:
            for seq_ in seq:
                tokenizer.complete_sequence(seq_)

    return seq

def _in_as_seq(complete: bool = True, decode_bpe: bool = True):
    r"""Decorator creating, if necessary, and completing a TokSequence object before the function is called.
    This decorator is made to be used by the :py:meth:`miditok.MIDITokenizer.tokens_to_midi` method.
@@ -47,50 +124,16 @@ def wrapper(*args, **kwargs):
        if not isinstance(seq, TokSequence) and not all(
            isinstance(seq_, TokSequence) for seq_ in seq
        ):
            try:
                arg = ("ids", convert_ids_tensors_to_list(seq))
            except (AttributeError, ValueError, TypeError, IndexError):
                if isinstance(seq[0], str) or (
                    isinstance(seq[0], str) and isinstance(seq[0][0], str)
                ):
                    arg = ("tokens", seq)
                else:  # list of Event, very unlikely
                    arg = ("events", seq)

            # Deduce nb of subscript, if tokenizer is multi-voc or unique_track
            nb_subscripts = nb_real_subscripts = 1
            if not tokenizer.unique_track:
                nb_subscripts += 1
            if tokenizer.is_multi_voc:
                nb_subscripts += 1
            if isinstance(arg[1][0], list):
                nb_real_subscripts += 1
                if isinstance(arg[1][0][0], list):
                    nb_real_subscripts += 1

            if not tokenizer.unique_track and nb_subscripts == nb_real_subscripts:
                seq = []
                for obj in arg[1]:
                    kwarg = {arg[0]: obj}
                    seq.append(TokSequence(**kwarg))
                    if not tokenizer.is_multi_voc:
                        seq[-1].ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(
                            seq[-1].ids
                        )
            else:  # 1 subscript, unique_track and no multi-voc
                kwarg = {arg[0]: arg[1]}
                seq = TokSequence(**kwarg)
                if not tokenizer.is_multi_voc:
                    seq.ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(seq.ids)

            if tokenizer.has_bpe and decode_bpe:
                tokenizer.decode_bpe(seq)
            if complete:
                if isinstance(seq, TokSequence):
                    tokenizer.complete_sequence(seq)
                else:
                    for seq_ in seq:
                        tokenizer.complete_sequence(seq_)
            seq = convert_sequence_to_tokseq(tokenizer, seq, complete, decode_bpe)
        else:
            if tokenizer.has_bpe and decode_bpe:
                tokenizer.decode_bpe(seq)
            if complete:
                if isinstance(seq, TokSequence):
                    tokenizer.complete_sequence(seq)
                else:
                    for seq_ in seq:
                        tokenizer.complete_sequence(seq_)

        args = list(args)
        args[1] = seq
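In effect, a decorated method such as tokens_to_midi accepts raw ids (lists, numpy arrays, PyTorch or TensorFlow tensors) and now delegates their conversion to convert_sequence_to_tokseq. A hedged sketch (the ids are hypothetical):

    ids = [[5, 72, 130, 142], [5, 65, 124, 138]]  # one list of ids per track
    midi = tokenizer.tokens_to_midi(ids)  # the decorator builds TokSequence objects first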
@@ -117,9 +160,9 @@ class MIDITokenizer(ABC):
r"""MIDI tokenizer base class, containing common methods and attributes for all tokenizers.
:param tokenizer_config: the tokenizer's configuration, as a :class:`miditok.classes.TokenizerConfig` object.
:param unique_track: set to True if the tokenizer works only with a unique track.
Tokens will be saved as a single track. This applies to representations that natively handle
multiple tracks such as Octuple, resulting in a single "stream" of tokens for all tracks.
:param one_token_stream: give True if the tokenizer handle all the tracks of a MIDI as a single sequence of tokens.
Tokens will be saved as a single sequence. This applies to representations that natively handle
multiple tracks such as Octuple or REMIPlus, resulting in a single "stream" of tokens per MIDI.
This attribute will be saved in config files of the tokenizer. (default: False)
:param params: path to a tokenizer config file. This will override other arguments and
load the tokenizer based on the config file. This is particularly useful if the
@@ -129,7 +172,7 @@ class MIDITokenizer(ABC):
    def __init__(
        self,
        tokenizer_config: TokenizerConfig = None,
        unique_track: bool = False,
        one_token_stream: bool = False,
        params: Union[str, Path] = None,
    ):
        # Initialize params
@@ -162,7 +205,7 @@ def __init__(
        assert (
            0 < self.config.nb_velocities < 128
        ), "You must specify a nb_velocities between 1 and 127 (included)"
        self.unique_track = unique_track
        self.one_token_stream = one_token_stream

        # Tweak the tokenizer's configuration and / or attributes before creating the vocabulary
        # This method is intended to be overridden by inheriting tokenizer classes
@@ -457,7 +500,7 @@ def midi_to_tokens(
        :param midi: the MIDI object to convert.
        :param apply_bpe_if_possible: will apply BPE if the tokenizer's vocabulary was learned with BPE.
        :return: a :class:`miditok.TokSequence` if `tokenizer.unique_track` is true, else a list of
        :return: a :class:`miditok.TokSequence` if `tokenizer.one_token_stream` is true, else a list of
            :class:`miditok.TokSequence` objects.
        """
        # Check if the durations values have been calculated before for this time division
@@ -619,14 +662,14 @@ def tokens_to_midi(
        :param tokens: tokens to convert. Can be either a list of :class:`miditok.TokSequence`,
            a Tensor (PyTorch and Tensorflow are supported), a numpy array or a Python list of ints.
            The first dimension represents tracks, unless the tokenizer handles tracks altogether as a
            single token sequence (e.g. Octuple, MuMIDI): tokenizer.unique_track == True.
            single token sequence (e.g. Octuple, MuMIDI): tokenizer.one_token_stream == True.
        :param programs: programs of the tracks. If none is given, will default to piano, program 0. (default: None)
        :param output_path: path to save the file. (default: None)
        :param time_division: MIDI time division / resolution, in ticks/beat (of the MIDI to create).
        :return: the midi object (miditoolkit.MidiFile).
        """
        midi = MidiFile(ticks_per_beat=time_division)
        # if self.unique_track:
        # if self.one_token_stream:
        #     tokens = [tokens]
        for i, track_tokens in enumerate(tokens):
            if programs is not None:
@@ -1041,7 +1084,7 @@ def learn_bpe(
sample["ids"], as_one_str=True
) # list of str (bytes)
iterator += (
[[byte_] for byte_ in bytes_] if not self.unique_track else [bytes_]
[[byte_] for byte_ in bytes_] if not self.one_token_stream else [bytes_]
)

# This doesn't seem to work, the trainer pre-processes the sequences, but then no word remains
@@ -1178,7 +1221,7 @@ def apply_bpe_to_dataset(
            sample = self.load_tokens(path)
            seq = (
                TokSequence(ids=sample["ids"])
                if self.unique_track
                if self.one_token_stream
                else [TokSequence(ids=track) for track in sample["ids"]]
            )
            self.apply_bpe(seq)
@@ -1460,7 +1503,7 @@ def save_params(
        }
        params = {
            "config": dict_config,
            "unique_track": self.unique_track,
            "one_token_stream": self.one_token_stream,
            "has_bpe": self.has_bpe,
            "tokenization": self.__class__.__name__,
            "miditok_version": CURRENT_VERSION_PACKAGE,
@@ -1532,6 +1575,9 @@ def _load_params(self, config_file_path: Union[str, Path]):
                key = old_add_tokens_attr[key]
                setattr(self.config, key, value)
                continue
            elif key == "unique_track":
                # For config files <= v2.1.1, before the attribute was renamed
                self.one_token_stream = value

            setattr(self, key, value)
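A sketch of the backward compatibility this provides (the file path is hypothetical): a config saved by an earlier version still loads, its unique_track value landing in one_token_stream:

    from miditok import REMI

    tokenizer = REMI(params="old_tokenizer_config.json")  # config saved before the rename
    print(tokenizer.one_token_stream)  # set from the old "unique_track" entry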

@@ -1544,6 +1590,17 @@ def is_multi_voc(self) -> bool:
"""
return isinstance(self._vocab_base, list)

    @property
    def io_format(self) -> Tuple[str, ...]:
        format_ = []
        if not self.one_token_stream:
            format_.append("I")
        format_.append("T")
        if self.is_multi_voc:
            format_.append("C")

        return tuple(d for d in format_)
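For instance, a sketch assuming default configurations:

    from miditok import REMI, Octuple, TokenizerConfig

    print(REMI(TokenizerConfig()).io_format)     # -> ('I', 'T'): one token stream per track
    print(Octuple(TokenizerConfig()).io_format)  # -> ('T', 'C'): one stream, C subtokens per step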

    def __call__(self, obj: Any, *args, **kwargs):
        r"""Calling a tokenizer allows you to directly convert a MIDI to tokens, or the other way around.
        The method automatically detects MIDI and token objects, as well as paths, and can directly load
@@ -1599,13 +1656,22 @@ def len(self) -> Union[int, List[int]]:
        return [len(v) for v in self.vocab] if self.is_multi_voc else len(self)

    def __repr__(self):
        out_str = f"{self.len} tokens"
        out_str = f"{self.len} tokens with {self.io_format} io format"

        # one_token_stream / multi-voc
        tmp = []
        if self.one_token_stream:
            tmp.append("one token stream")
        if self.is_multi_voc:
            out_str += " (multi-voc)"
            tmp.append("multi-voc")
        if len(tmp) > 0:
            out_str += f" ({', '.join(tmp)})"

        # BPE
        if self.has_bpe:
            out_str += " with BPE"
            out_str += ", with BPE"
        else:
            out_str += " without BPE"
            out_str += ", without BPE"
        return out_str
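With these changes, printing a tokenizer would yield something like the following (the vocabulary size is illustrative):

    >>> from miditok import REMI, TokenizerConfig
    >>> print(REMI(TokenizerConfig()))
    282 tokens with ('I', 'T') io format, without BPE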

    def __getitem__(
2 changes: 1 addition & 1 deletion miditok/tokenizations/mumidi.py
@@ -67,7 +67,7 @@ def __init__(
    def _tweak_config_before_creating_voc(self):
        self.config.use_rests = False
        self.config.use_time_signatures = False
        # self.unique_track = True
        # self.one_token_stream = True

        self.vocab_types_idx = {
            "Pitch": 0,