Commit 8c2349b

unique_track renamed one_token_stream, convert_sequence_to_tokseq method, io_format property
Natooz committed Jul 24, 2023
1 parent 0eef074 commit 8c2349b
Showing 13 changed files with 171 additions and 99 deletions.
18 changes: 12 additions & 6 deletions docs/midi_tokenizer.rst
@@ -144,16 +144,16 @@ Tokens & TokSequence input / output format

Depending on the tokenizer in use, the **format** of the tokens returned by the ``midi_to_tokens`` method may vary, as will the format expected by the ``tokens_to_midi`` method. For any given tokenizer, the format is the same for both methods.

The format is deduced from the ``is_multi_voc`` and ``unique_track`` tokenizer properties. In short: **unique_track** being True means that the tokenizer will convert a MIDI file into a single stream of tokens for all instrument tracks, otherwise it will convert each track to a distinct token stream; **is_multi_voc** being True means that each token stream is a list of lists of tokens, of shape ``(T,C)`` for T time steps and C subtokens per time step.
The format is deduced from the ``is_multi_voc`` and ``one_token_stream`` tokenizer properties. In short: **one_token_stream** being True means that the tokenizer will convert a MIDI file into a single stream of tokens for all instrument tracks, otherwise it will convert each track to a distinct token stream; **is_multi_voc** being True means that each token stream is a list of lists of tokens, of shape ``(T,C)`` for T time steps and C subtokens per time step.

This results in four situations, where I is the number of tracks, T is the number of tokens (or time steps) and C the number of subtokens per time step:

* **is_multi_voc** and **unique_track** are both **False**: ``[I,(T)]``
* **is_multi_voc** is **False** and **unique_track** is **True**: ``(T)``
* **is_multi_voc** is **True** and **unique_track** is **False**: ``[I,(T,C)]``
* **is_multi_voc** and **unique_track** are both **True**: ``(T,C)``
* **is_multi_voc** and **one_token_stream** are both **False**: ``[I,(T)]``
* **is_multi_voc** is **False** and **one_token_stream** is **True**: ``(T)``
* **is_multi_voc** is **True** and **one_token_stream** is **False**: ``[I,(T,C)]``
* **is_multi_voc** and **one_token_stream** are both **True**: ``(T,C)``

**Note that if there is no I dimension in the format, the output of ``midi_to_tokens`` is a ``TokSequence`` object, otherwise it is a list of ``TokSequence`` objects (one per token stream / track).**
**Note that if there is no I dimension in the format, the output of **``midi_to_tokens``** is a **:class:`miditok.TokSequence`** object, otherwise it is a list of **:class:`miditok.TokSequence`** objects (one per token stream / track).**

Some tokenizer examples to illustrate:

@@ -163,6 +163,12 @@ Some tokenizer examples to illustrate:
* **Octuple** is a multi-voc tokenizer and converts all MIDI tracks to a single stream of tokens, hence it will convert MIDI files to a single ``TokSequence`` object, in ``(T,C)`` format.
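To make the four formats concrete, here is a minimal sketch (the file path is hypothetical, and the tokenizers are assumed to use default configurations)::

    from miditok import REMI, Octuple, TokenizerConfig
    from miditoolkit import MidiFile

    midi = MidiFile("path/to/file.mid")  # hypothetical path

    remi = REMI(TokenizerConfig())  # is_multi_voc=False, one_token_stream=False
    tokens = remi.midi_to_tokens(midi)  # list of TokSequence, one per track: [I,(T)]

    octuple = Octuple(TokenizerConfig())  # is_multi_voc=True, one_token_stream=True
    tokens = octuple.midi_to_tokens(midi)  # a single TokSequence: (T,C)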


**You can use the **``convert_sequence_to_tokseq``** function to automatically convert an input sequence, of ids (integers) or tokens (strings), into a **:class:`miditok.TokSequence`** or list of **:class:`miditok.TokSequence`** objects with the appropriate format for the tokenizer being used.**

.. autofunction:: miditok.convert_sequence_to_tokseq
    :noindex:
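
A minimal usage sketch (the ids below are made up for illustration)::

    from miditok import REMI, TokenizerConfig, convert_sequence_to_tokseq

    tokenizer = REMI(TokenizerConfig())
    ids = [[5, 72, 130, 142], [5, 65, 124, 138]]  # hypothetical ids, one list per track
    seqs = convert_sequence_to_tokseq(tokenizer, ids, complete_seq=False)  # skip completion for toy ids
    # seqs is a list of TokSequence objects, matching the tokenizer's ("I", "T") format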


Magic methods
------------------------

2 changes: 1 addition & 1 deletion miditok/__init__.py
@@ -10,7 +10,7 @@
    MuMIDI,
    MMM,
)
from .midi_tokenizer import MIDITokenizer
from .midi_tokenizer import MIDITokenizer, convert_sequence_to_tokseq
from .classes import Event, TokSequence, TokenizerConfig

from .utils import utils
14 changes: 7 additions & 7 deletions miditok/data_augmentation/data_augmentation.py
@@ -72,7 +72,7 @@ def data_augmentation_dataset(
        file = json.load(json_file)
        ids, programs = file["ids"], file["programs"]

        if tokenizer.unique_track:
        if tokenizer.one_token_stream:
            ids = [ids]

        # Perform data augmentation for each track
@@ -91,9 +91,9 @@
        ] = {}
        for track, (_, is_drum) in zip(ids, programs):
            # we don't augment drums
            if not tokenizer.unique_track and is_drum:
            if not tokenizer.one_token_stream and is_drum:
                continue
            elif tokenizer.unique_track and all(p[1] for p in programs):
            elif tokenizer.one_token_stream and all(p[1] for p in programs):
                continue
            corrected_offsets = deepcopy(offsets)
            vel_dim = int(128 / len(tokenizer.velocities))
@@ -109,14 +109,14 @@
            if len(aug) == 0:
                continue
            for aug_offsets, seq in aug:
                if tokenizer.unique_track:
                if tokenizer.one_token_stream:
                    augmented_tokens[aug_offsets] = seq
                    continue
                try:
                    augmented_tokens[aug_offsets].append(seq)
                except KeyError:
                    augmented_tokens[aug_offsets] = [seq]
        if not tokenizer.unique_track:
        if not tokenizer.one_token_stream:
            for i, (track, (_, is_drum)) in enumerate(
                zip(ids, programs)
            ):  # adding drums to all already augmented
@@ -142,7 +142,7 @@
                nb_augmentations += 1
                nb_tracks_augmented += len(tracks_seq)
        if copy_original_in_new_location and out_path is not None:
            if tokenizer.unique_track:
            if tokenizer.one_token_stream:
                ids = ids[0]
            tokenizer.save_tokens(
                ids, out_path / f"{file_path.stem}.json", programs
@@ -455,7 +455,7 @@ def data_augmentation_tokens(
    note_off_tokens = np.array(tokenizer.token_ids_of_type("NoteOff"))
    mask_pitch = np.isin(tokens, pitch_tokens)
    # If applicable, removes drum notes from the mask
    if tokenizer.unique_track:
    if tokenizer.one_token_stream:
        for idx, is_note in enumerate(mask_pitch):
            if (
                is_note
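The pattern handled throughout this module: a one-token-stream tokenizer saves a single sequence of ids, which the augmentation code wraps in a list so the same per-track loops apply. A minimal sketch of that normalization (the file name is hypothetical, and `tokenizer` is assumed to be already loaded):

    import json

    with open("tokens.json") as json_file:  # hypothetical tokenized file
        file = json.load(json_file)
    ids, programs = file["ids"], file["programs"]

    if tokenizer.one_token_stream:
        ids = [ids]  # wrap the single stream so it iterates like a list of tracks
    for track in ids:
        ...  # apply pitch / velocity / duration offsets per track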
184 changes: 125 additions & 59 deletions miditok/midi_tokenizer.py
@@ -32,6 +32,83 @@
)


def convert_sequence_to_tokseq(
    tokenizer,
    input_seq,
    complete_seq: bool = True,
    decode_bpe: bool = True,
) -> Union[TokSequence, List[TokSequence]]:
    r"""Converts a sequence into a :class:`miditok.TokSequence` or list of :class:`miditok.TokSequence`
    objects with the appropriate format of the tokenizer being used.

    :param tokenizer: tokenizer being used with the sequence.
    :param input_seq: sequence to convert. It can be a list of ids (integers), tokens (strings) or events
        (Event). It can also be a PyTorch or TensorFlow tensor, or a Numpy array representing ids.
    :param complete_seq: will complete the output sequence(s). (default: True)
    :param decode_bpe: if the input sequence contains ids holding BPE tokens, they will be decoded.
        (default: True)
    :return: a :class:`miditok.TokSequence` object, or a list of :class:`miditok.TokSequence` objects
        if the tokenizer converts each track to a distinct token stream.
    """
    # Deduce the type of data (ids/tokens/events)
    try:
        arg = ("ids", convert_ids_tensors_to_list(input_seq))
    except (AttributeError, ValueError, TypeError, IndexError):
        if isinstance(input_seq[0], str) or (
            isinstance(input_seq[0], list) and isinstance(input_seq[0][0], str)
        ):
            arg = ("tokens", input_seq)
        else:  # list of Event, but unlikely
            arg = ("events", input_seq)

    # Deduce nb of subscripts / dims
    nb_io_dims = len(tokenizer.io_format)
    nb_seq_dims = 1
    if isinstance(arg[1][0], list):
        nb_seq_dims += 1
        if isinstance(arg[1][0][0], list):
            nb_seq_dims += 1

    # Check the number of dimensions is good
    # In case of no one_token_stream and one dimension short --> unsqueeze
    if not tokenizer.one_token_stream and nb_seq_dims == nb_io_dims - 1:
        print(
            f"The input sequence has one dimension less than expected ({nb_seq_dims} instead of "
            f"{nb_io_dims}). It is being unsqueezed to conform with the tokenizer's i/o format "
            f"({tokenizer.io_format})"
        )
        arg = (arg[0], [arg[1]])
        nb_seq_dims += 1  # keep the dimension count consistent for the checks below

    elif nb_seq_dims != nb_io_dims:
        raise ValueError(
            f"The input sequence does not have the expected dimension "
            f"({nb_seq_dims} instead of {nb_io_dims})."
        )

    # Convert to TokSequence
    if not tokenizer.one_token_stream and nb_io_dims == nb_seq_dims:
        seq = []
        for obj in arg[1]:
            kwarg = {arg[0]: obj}
            seq.append(TokSequence(**kwarg))
            if not tokenizer.is_multi_voc:
                seq[-1].ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(
                    seq[-1].ids
                )
    else:  # 1 subscript, one_token_stream and no multi-voc
        kwarg = {arg[0]: arg[1]}
        seq = TokSequence(**kwarg)
        if not tokenizer.is_multi_voc:
            seq.ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(seq.ids)

    # Decode BPE and complete the output sequence(s) if requested
    if tokenizer.has_bpe and decode_bpe:
        tokenizer.decode_bpe(seq)
    if complete_seq:
        if isinstance(seq, TokSequence):
            tokenizer.complete_sequence(seq)
        else:
            for seq_ in seq:
                tokenizer.complete_sequence(seq_)

    return seq

def _in_as_seq(complete: bool = True, decode_bpe: bool = True):
    r"""Decorator creating, if necessary, and completing a TokSequence object before the function is called.
    This decorator is made to be used by the :py:meth:`miditok.MIDITokenizer.tokens_to_midi` method.
@@ -47,50 +124,16 @@ def wrapper(*args, **kwargs):
        if not isinstance(seq, TokSequence) and not all(
            isinstance(seq_, TokSequence) for seq_ in seq
        ):
            try:
                arg = ("ids", convert_ids_tensors_to_list(seq))
            except (AttributeError, ValueError, TypeError, IndexError):
                if isinstance(seq[0], str) or (
                    isinstance(seq[0], str) and isinstance(seq[0][0], str)
                ):
                    arg = ("tokens", seq)
                else:  # list of Event, very unlikely
                    arg = ("events", seq)

            # Deduce nb of subscript, if tokenizer is multi-voc or unique_track
            nb_subscripts = nb_real_subscripts = 1
            if not tokenizer.unique_track:
                nb_subscripts += 1
            if tokenizer.is_multi_voc:
                nb_subscripts += 1
            if isinstance(arg[1][0], list):
                nb_real_subscripts += 1
                if isinstance(arg[1][0][0], list):
                    nb_real_subscripts += 1

            if not tokenizer.unique_track and nb_subscripts == nb_real_subscripts:
                seq = []
                for obj in arg[1]:
                    kwarg = {arg[0]: obj}
                    seq.append(TokSequence(**kwarg))
                    if not tokenizer.is_multi_voc:
                        seq[-1].ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(
                            seq[-1].ids
                        )
            else:  # 1 subscript, unique_track and no multi-voc
                kwarg = {arg[0]: arg[1]}
                seq = TokSequence(**kwarg)
                if not tokenizer.is_multi_voc:
                    seq.ids_bpe_encoded = tokenizer._are_ids_bpe_encoded(seq.ids)

            if tokenizer.has_bpe and decode_bpe:
                tokenizer.decode_bpe(seq)
            if complete:
                if isinstance(seq, TokSequence):
                    tokenizer.complete_sequence(seq)
                else:
                    for seq_ in seq:
                        tokenizer.complete_sequence(seq_)
            seq = convert_sequence_to_tokseq(tokenizer, seq, complete, decode_bpe)
        else:
            if tokenizer.has_bpe and decode_bpe:
                tokenizer.decode_bpe(seq)
            if complete:
                if isinstance(seq, TokSequence):
                    tokenizer.complete_sequence(seq)
                else:
                    for seq_ in seq:
                        tokenizer.complete_sequence(seq_)

        args = list(args)
        args[1] = seq
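In effect, a decorated method such as tokens_to_midi accepts raw ids (lists, numpy arrays, PyTorch or TensorFlow tensors) and now delegates their conversion to convert_sequence_to_tokseq. A hedged sketch (the ids are hypothetical):

    ids = [[5, 72, 130, 142], [5, 65, 124, 138]]  # one list of ids per track
    midi = tokenizer.tokens_to_midi(ids)  # the decorator builds TokSequence objects first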
@@ -117,9 +160,9 @@ class MIDITokenizer(ABC):
r"""MIDI tokenizer base class, containing common methods and attributes for all tokenizers.
:param tokenizer_config: the tokenizer's configuration, as a :class:`miditok.classes.TokenizerConfig` object.
:param unique_track: set to True if the tokenizer works only with a unique track.
Tokens will be saved as a single track. This applies to representations that natively handle
multiple tracks such as Octuple, resulting in a single "stream" of tokens for all tracks.
:param one_token_stream: give True if the tokenizer handle all the tracks of a MIDI as a single sequence of tokens.
Tokens will be saved as a single sequence. This applies to representations that natively handle
multiple tracks such as Octuple or REMIPlus, resulting in a single "stream" of tokens per MIDI.
This attribute will be saved in config files of the tokenizer. (default: False)
:param params: path to a tokenizer config file. This will override other arguments and
load the tokenizer based on the config file. This is particularly useful if the
@@ -129,7 +172,7 @@ class MIDITokenizer(ABC):
    def __init__(
        self,
        tokenizer_config: TokenizerConfig = None,
        unique_track: bool = False,
        one_token_stream: bool = False,
        params: Union[str, Path] = None,
    ):
        # Initialize params
@@ -162,7 +205,7 @@ def __init__(
        assert (
            0 < self.config.nb_velocities < 128
        ), "You must specify a nb_velocities between 1 and 127 (included)"
        self.unique_track = unique_track
        self.one_token_stream = one_token_stream

        # Tweak the tokenizer's configuration and / or attributes before creating the vocabulary
        # This method is intended to be overridden by inheriting tokenizer classes
@@ -457,7 +500,7 @@ def midi_to_tokens(
        :param midi: the MIDI object to convert.
        :param apply_bpe_if_possible: will apply BPE if the tokenizer's vocabulary was learned with BPE.
        :return: a :class:`miditok.TokSequence` if `tokenizer.unique_track` is true, else a list of
        :return: a :class:`miditok.TokSequence` if `tokenizer.one_token_stream` is true, else a list of
            :class:`miditok.TokSequence` objects.
        """
        # Check if the durations values have been calculated before for this time division
@@ -619,14 +662,14 @@ def tokens_to_midi(
        :param tokens: tokens to convert. Can be either a list of :class:`miditok.TokSequence`,
            a Tensor (PyTorch and Tensorflow are supported), a numpy array or a Python list of ints.
            The first dimension represents tracks, unless the tokenizer handles tracks altogether as a
            single token sequence (e.g. Octuple, MuMIDI): tokenizer.unique_track == True.
            single token sequence (e.g. Octuple, MuMIDI): tokenizer.one_token_stream == True.
        :param programs: programs of the tracks. If none is given, will default to piano, program 0. (default: None)
        :param output_path: path to save the file. (default: None)
        :param time_division: MIDI time division / resolution, in ticks/beat (of the MIDI to create).
        :return: the midi object (miditoolkit.MidiFile).
        """
        midi = MidiFile(ticks_per_beat=time_division)
        # if self.unique_track:
        # if self.one_token_stream:
        #     tokens = [tokens]
        for i, track_tokens in enumerate(tokens):
            if programs is not None:
@@ -1041,7 +1084,7 @@ def learn_bpe(
sample["ids"], as_one_str=True
) # list of str (bytes)
iterator += (
[[byte_] for byte_ in bytes_] if not self.unique_track else [bytes_]
[[byte_] for byte_ in bytes_] if not self.one_token_stream else [bytes_]
)

# This doesn't seem to work, the trainer pre-processes the sequences, but then no word remains
@@ -1178,7 +1221,7 @@ def apply_bpe_to_dataset(
            sample = self.load_tokens(path)
            seq = (
                TokSequence(ids=sample["ids"])
                if self.unique_track
                if self.one_token_stream
                else [TokSequence(ids=track) for track in sample["ids"]]
            )
            self.apply_bpe(seq)
@@ -1460,7 +1503,7 @@ def save_params(
        }
        params = {
            "config": dict_config,
            "unique_track": self.unique_track,
            "one_token_stream": self.one_token_stream,
            "has_bpe": self.has_bpe,
            "tokenization": self.__class__.__name__,
            "miditok_version": CURRENT_VERSION_PACKAGE,
@@ -1532,6 +1575,9 @@ def _load_params(self, config_file_path: Union[str, Path]):
                key = old_add_tokens_attr[key]
                setattr(self.config, key, value)
                continue
            elif key == "unique_track":
                # For config files <= v2.1.1, before the attribute was renamed
                self.one_token_stream = value

            setattr(self, key, value)
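A sketch of the backward compatibility this provides (the file path is hypothetical): a config saved by an earlier version still loads, its unique_track value landing in one_token_stream:

    from miditok import REMI

    tokenizer = REMI(params="old_tokenizer_config.json")  # config saved before the rename
    print(tokenizer.one_token_stream)  # set from the old "unique_track" entry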

@@ -1544,6 +1590,17 @@ def is_multi_voc(self) -> bool:
"""
return isinstance(self._vocab_base, list)

    @property
    def io_format(self) -> Tuple[str, ...]:
        format_ = []
        if not self.one_token_stream:
            format_.append("I")
        format_.append("T")
        if self.is_multi_voc:
            format_.append("C")

        return tuple(d for d in format_)
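For instance, a sketch assuming default configurations:

    from miditok import REMI, Octuple, TokenizerConfig

    print(REMI(TokenizerConfig()).io_format)     # -> ('I', 'T'): one token stream per track
    print(Octuple(TokenizerConfig()).io_format)  # -> ('T', 'C'): one stream, C subtokens per step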

    def __call__(self, obj: Any, *args, **kwargs):
        r"""Calling a tokenizer allows you to directly convert a MIDI to tokens, or the other way around.
        The method automatically detects MIDI and token objects, as well as paths, and can directly load
@@ -1599,13 +1656,22 @@ def len(self) -> Union[int, List[int]]:
        return [len(v) for v in self.vocab] if self.is_multi_voc else len(self)

    def __repr__(self):
        out_str = f"{self.len} tokens"
        out_str = f"{self.len} tokens with {self.io_format} io format"

        # one_token_stream / multi-voc
        tmp = []
        if self.one_token_stream:
            tmp.append("one token stream")
        if self.is_multi_voc:
            out_str += " (multi-voc)"
            tmp.append("multi-voc")
        if len(tmp) > 0:
            out_str += f" ({', '.join(tmp)})"

        # BPE
        if self.has_bpe:
            out_str += " with BPE"
            out_str += ", with BPE"
        else:
            out_str += " without BPE"
            out_str += ", without BPE"
        return out_str
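With these changes, printing a tokenizer would yield something like the following (the vocabulary size is illustrative):

    >>> from miditok import REMI, TokenizerConfig
    >>> print(REMI(TokenizerConfig()))
    282 tokens with ('I', 'T') io format, without BPE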

    def __getitem__(
2 changes: 1 addition & 1 deletion miditok/tokenizations/mumidi.py
@@ -67,7 +67,7 @@ def __init__(
    def _tweak_config_before_creating_voc(self):
        self.config.use_rests = False
        self.config.use_time_signatures = False
        # self.unique_track = True
        # self.one_token_stream = True

        self.vocab_types_idx = {
            "Pitch": 0,