Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Interval search #898

Merged
merged 6 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 87 additions & 11 deletions app/public/cantusdata/helpers/mei_processing/mei_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
Defines associated types for the data structures used by the parser.
"""

from typing import Tuple, Dict, List, Iterator, Optional
from typing import Tuple, Dict, List, Iterator, Optional, Literal
from lxml import etree # pylint: disable=no-name-in-module
from .mei_parsing_types import (
Zone,
Expand Down Expand Up @@ -130,8 +130,10 @@ def _parse_syllable_text(self, syl_elem: Optional[etree.Element]) -> SyllableTex
:param syllable: A syllable element from an MEI file
:return: Dictionary of syllable text data
"""
if syl_elem is not None and syl_elem.text:
text_dict: SyllableText = {
# Ignoring type of next two expressions because for some reason
# mypy thinks they are unreachable, but we know they are not.
if syl_elem is not None and syl_elem.text: # type: ignore
text_dict: SyllableText = { # type: ignore
"text": syl_elem.text.strip(),
"bounding_box": self._get_element_zone(syl_elem),
}
Expand Down Expand Up @@ -183,22 +185,29 @@ def _parse_neume(
)
if parsed_neume_component:
parsed_nc_elements.append(parsed_neume_component)
neume_name, intervals, contours = analyze_neume(parsed_nc_elements)
neume_name, semitone_intervals, contours, intervals = analyze_neume(
parsed_nc_elements
)
# If the first neume component of the next syllable can be parsed,
# add the interval and contour between the final neume component of
# add intervals and contour between the final neume component of
# the current syllable and the first neume component of the next syllable.
if next_neume_component is not None:
parsed_next_neume_comp: Optional[NeumeComponentElementData] = (
self._parse_neume_component_element(next_neume_component)
)
if parsed_next_neume_comp:
last_neume_comp = parsed_nc_elements[-1]
intervals.append(
semitone_intervals.append(
get_semitones_between_neume_components(
last_neume_comp, parsed_next_neume_comp
)
)
contours.append(get_contour_from_interval(intervals[-1]))
contours.append(get_contour_from_interval(semitone_intervals[-1]))
intervals.append(
get_melodic_interval(
semitone_intervals[-1], last_neume_comp["pname"]
)
)
# Get a bounding box for the neume by combining bounding boxes of
# its components. Note that a single neume does not span multiple
# systems, so the combined bounding box will be a single zone.
Expand All @@ -212,8 +221,11 @@ def _parse_neume(
"pname": nc["pname"],
"octave": nc["octave"],
"bounding_box": nc["bounding_box"],
"semitone_interval": intervals[i] if i < len(intervals) else None,
"semitone_interval": (
semitone_intervals[i] if i < len(semitone_intervals) else None
),
"contour": contours[i] if i < len(contours) else None,
"interval": intervals[i] if i < len(intervals) else None,
"system": neume_system,
}
)
Expand Down Expand Up @@ -403,7 +415,7 @@ def get_contour_from_interval(interval: int) -> ContourType:
"""
Compute the contour of an interval.

:param interval: The size of the interval in semitones
:param interval: The size of the interval in semitones or steps
:return: The contour of the interval ("u"[p], "d"[own], or "r"[epeat])
"""
if interval < 0:
Expand All @@ -413,20 +425,81 @@ def get_contour_from_interval(interval: int) -> ContourType:
return "r"


INTERVAL_TO_STEP_MAP: Dict[int, int] = {
0: 1, # unison
1: 2, # minor 2nd
2: 2, # major 2nd
3: 3, # minor 3rd
4: 3, # major 3rd
5: 4, # perfect 4th
# we handle 6 semitones separately below
# b/c we'll treat it as a 4th or 5th depending
# on the starting pitch and direction of the interval
7: 5, # perfect 5th
8: 6, # minor 6th
9: 6, # major 6th
10: 7, # minor 7th
11: 7, # major 7th
}


def get_melodic_interval(semitone_interval: int, starting_pitch_name: str) -> int:
"""
Uses the semitone size of an interval and the starting pitch
name to determine the size of a melodic interval.
In most cases, the interval is determined by the number of
semitones between the pitches. However, in the case of a 6-semitone
interval, the interval is determined by the starting pitch name
and the contour of the interval.

:param semitone_interval: The size of the interval in semitones
:param starting_pitch_name: The pitch name of the starting pitch

:return: A integer representing the size of the interval in steps,
a positive integer for an ascending interval and a negative
integer for a descending interval.
"""
if semitone_interval == 0:
return 1
interval_magnitude = abs(semitone_interval)
interval_direction = semitone_interval // interval_magnitude
interval_octaves = interval_magnitude // 12
interval_mod_12 = interval_magnitude % 12
interval: int
if interval_mod_12 == 6:
# Note: Since we don't currently handle any accidentals,
# intervals between b's and f's are the only place we'll
# see a 6-semitone interval.
match (starting_pitch_name, interval_direction > 0):
case ("b", True): # b up to f is a 5th
interval = 5
case ("b", False): # b down to f is a 4th
interval = 4
case ("f", True): # f up to b is a 4th
interval = 4
case ("f", False): # f down to b is a 5th
interval = 5
else:
interval = INTERVAL_TO_STEP_MAP[interval_mod_12]
return (interval + 7 * interval_octaves) * interval_direction


def analyze_neume(
neume: List[NeumeComponentElementData],
) -> Tuple[NeumeName, List[int], List[ContourType]]:
) -> Tuple[NeumeName, List[int], List[ContourType], List[int]]:
"""
Analyze a neume (a list of neume components) to determine:
- The neume type (e.g., punctum, pes, clivis, etc.)
- The intervals in the neume in semitones
- The contour of the neume
- The intervals in the neume in steps ("3rd", "4th")

:param neume: A list of neume components (a list of NeumeComponentsType dictionaries)
:return: A tuple of information about the neume:
- Neume type (str)
- Neume intervals in semitones (list of ints)
- Neume contour (list of "u"[p], "d"[own], or "r"[epeat])
- Neume intervals in steps (list of ints)
"""
semitone_intervals: List[int] = [
get_semitones_between_neume_components(nc1, nc2)
Expand All @@ -435,5 +508,8 @@ def analyze_neume(
contours: List[ContourType] = [
get_contour_from_interval(i) for i in semitone_intervals
]
intervals: List[int] = [
get_melodic_interval(i, nc["pname"]) for i, nc in zip(semitone_intervals, neume)
]
neume_type: NeumeName = NEUME_GROUPS.get("".join(contours), "compound")
return neume_type, semitone_intervals, contours
return neume_type, semitone_intervals, contours, intervals
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,14 @@ class NeumeComponent(NeumeComponentElementData):
this is None.
contour: The contour ("u"[p], "d"[own], or "r"[epeat]) of 'interval'. If there is no
following neume component, this is None.
interval: The interval (2nd, 5th, etc) between the neume component and the following
neume component. If there is no following neume component, this is None.
system: The system number that the neume component is on
"""

semitone_interval: Optional[int]
contour: Optional[ContourType]
interval: Optional[int]
system: int


Expand Down Expand Up @@ -130,6 +133,8 @@ class NgramDocument(TypedDict):
by underscores.
semitone_interval: A string containing the semitone intervals between the neume components
in the n-gram, separated by underscores.
intervals: A string containing the intervals between the neume components in the n-gram,
separated by underscores.
neume_names: A string containing the names of the neumes in the n-gram,
separated by underscores. This field is not required, and is only present when
the n-gram contains complete neumes.
Expand All @@ -146,6 +151,7 @@ class NgramDocument(TypedDict):
pitch_names: str
contour: str
semitone_intervals: str
intervals: str
neume_names: NotRequired[str]
manuscript_id: NotRequired[str]
folio: NotRequired[str]
Expand Down
31 changes: 22 additions & 9 deletions app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def flattened_neumes(self) -> List[Neume]:
def _stringify_neume_component_data(
self,
neume_components: List[NeumeComponent],
) -> Tuple[str, str, str]:
) -> Tuple[str, str, str, str]:
"""
Convert pitch, contour, and interval information from a list of
neume components into strings.
Expand All @@ -59,14 +59,26 @@ def _stringify_neume_component_data(
pnames: List[str] = []
contours: List[ContourType] = []
semitone_intervals: List[str] = []
intervals: List[str] = []
for idx, nc in enumerate(neume_components):
pnames.append(nc["pname"])
# The interval is None if and only if the countour is None,
# The semitone_interval is None if and only if the contour is None,
# so we can safely do this single check.
if nc["contour"] is not None and idx != len(neume_components) - 1:
contours.append(nc["contour"])
semitone_intervals.append(str(nc["semitone_interval"]))
return "_".join(pnames), "_".join(contours), "_".join(semitone_intervals)
# Collect one fewer contour/interval than the number of pitches so
# that the pitches and contours/intervals are aligned (number of intervals/
# contours = number of pitches - 1).
if idx != len(neume_components) - 1:
if nc["contour"] is not None:
contours.append(nc["contour"])
semitone_intervals.append(str(nc["semitone_interval"]))
if nc["interval"] is not None:
intervals.append(str(nc["interval"]))
return (
"_".join(pnames),
"_".join(contours),
"_".join(semitone_intervals),
"_".join(intervals),
)

def _create_document_from_neume_components(
self,
Expand All @@ -80,8 +92,8 @@ def _create_document_from_neume_components(
and the system number of that neume component.
:return: An NgramDocument containing the information from the neume components.
"""
pitch_names, contour, intervals = self._stringify_neume_component_data(
neume_components
pitch_names, contour, semitone_intervals, intervals = (
self._stringify_neume_component_data(neume_components)
)
zones_with_sys: List[Tuple[Zone, int]] = [
(nc["bounding_box"], nc["system"]) for nc in neume_components
Expand All @@ -91,7 +103,8 @@ def _create_document_from_neume_components(
"location_json": location,
"pitch_names": pitch_names,
"contour": contour,
"semitone_intervals": intervals,
"semitone_intervals": semitone_intervals,
"intervals": intervals,
"id": str(uuid.uuid4()),
"type": "omr_ngram",
}
Expand Down
53 changes: 53 additions & 0 deletions app/public/cantusdata/helpers/search_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
A collection of functions used by search views to validate and process
queries.
"""

# Contains the words that are allowed
# in a neume_name query
VALID_NEUME_NAME_WORDS = {
Expand All @@ -18,6 +23,22 @@
}


def validate_intervals_query_word(word: str) -> bool:
    """
    Returns True if the "word" is valid in an intervals query.

    Valid words are:
    - "r"[epeat] on its own, or
    - "u"[p] or "d"[own] followed by a positive integer > 1, written
      with ASCII digits and no leading zeros (e.g. "u2", "d10").

    An empty word is not valid.

    :param word: a single term of an intervals query
    :return: True if the word is valid, False otherwise
    """
    if not word:
        # Guard against empty terms, which would otherwise raise an
        # IndexError when the direction letter is read below.
        return False
    if word == "r":
        return True
    direction, magnitude = word[0], word[1:]
    if direction not in ("u", "d"):
        return False
    # str.isdigit alone accepts characters such as superscript digits, and
    # would let "0" or zero-padded numbers through; none of these are
    # integers > 1 in the canonical form described above.
    return (
        magnitude.isascii()
        and magnitude.isdigit()
        and magnitude[0] != "0"
        and magnitude != "1"
    )


def validate_query(q: list[str], q_type: str) -> bool:
"""
Depending on the type of the query, returns True if the query is valid
Expand All @@ -29,6 +50,8 @@ def validate_query(q: list[str], q_type: str) -> bool:
return all(pitch in "abcdefg" for pitch in q)
case "contour":
return all(contour in "udr" for contour in q)
case "intervals":
return all(validate_intervals_query_word(word) for word in q)
case _:
return False

Expand Down Expand Up @@ -65,3 +88,33 @@ def get_transpositions(sequence: list[str]) -> list[list[str]]:
transposed_chars = list(map(chr, asciinum))
transpositions.append(transposed_chars)
return transpositions


def translate_interval_query_direction(query_terms: list[str]) -> list[str]:
    """
    Convert user-entered interval query terms (alphanumeric strings such as
    "u3", "d2", "r") into the signed integer strings used in the Solr query
    (such as "3", "-2", "1").

    The translation rules are:
        - "r" becomes "1"
        - a term starting with "d" (a descending interval) becomes its
          magnitude prefixed with a minus sign
        - any other term (in practice, a "u" term for an ascending interval)
          becomes its bare magnitude

    :param query_terms: the terms of the interval query; these are assumed
        to have already been validated by validate_intervals_query_word

    :return: the translated terms, in the same order as the input
    """

    def _to_solr_term(user_term: str) -> str:
        # A repeated pitch is indexed as the interval "1".
        if user_term == "r":
            return "1"
        sign = "-" if user_term[0] == "d" else ""
        return f"{sign}{user_term[1:]}"

    return [_to_solr_term(user_term) for user_term in query_terms]
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer
from cantusdata.models.folio import Folio


MEI4_DIR = path.join("/code", "production-mei-files")
FOLIO_NUMBER_REGEX = re.compile(r"[a-zA-Z]?\d+[a-z]?")

Expand Down Expand Up @@ -93,7 +92,8 @@ def handle(self, *args: Any, **options: Any) -> None:
folio_number: str = mei_file.split("_")[-1].split(".")[0]
if not FOLIO_NUMBER_REGEX.match(folio_number):
raise ValueError(
f"MEI file {mei_file} does not match the expected format."
f"Folio number {folio_number} in MEI file {mei_file}"
"does not exist in the database."
)
if not folio_number in folio_map or folio_map[folio_number] == "":
self.stdout.write(
Expand Down
Loading
Loading