Skip to content

End Rhymes

poetry_analysis.rhyme_detection

Verse dataclass

Source code in src/poetry_analysis/rhyme_detection.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
@dataclass
class Verse:
    id_: str | int
    rhyme_score: int = 0
    rhyme_tag: str = ""
    text: str = ""
    transcription: str = ""
    tokens: list | None = None
    syllables: list | None = None
    last_token: str | None = None
    rhymes_with: str | int | None = None

    @property
    def dict(self) -> dict:
        """Return the Verse object as a dictionary."""
        dictionary = self.__dict__
        dictionary["verse_id"] = self.id_
        del dictionary["id_"]
        return dictionary

dict property

Return the Verse object as a dictionary.

collate_rhyme_scheme(annotated_stanza)

Join the rhyme tags from each tagged verse to form a rhyme scheme.

Source code in src/poetry_analysis/rhyme_detection.py
282
283
284
def collate_rhyme_scheme(annotated_stanza: list) -> str:
    """Join the rhyme tags from each tagged verse to form a rhyme scheme."""
    # One tag per verse, concatenated in stanza order.
    tags = [tagged_verse.rhyme_tag for tagged_verse in annotated_stanza]
    return "".join(tags)

find_last_stressed_syllable(syll)

Find the last stressed syllable in a list of syllables.

Source code in src/poetry_analysis/rhyme_detection.py
191
192
193
194
195
196
197
198
def find_last_stressed_syllable(syll):
    """Return the syllables from the last stressed one to the end of the list.

    A syllable counts as stressed when it contains one of the stress markers
    1, 2 or 3. If no syllable is stressed, a copy of the whole list is returned.
    """
    # Scan backwards so that the first hit is the last stressed syllable.
    for position in range(len(syll) - 1, -1, -1):
        if re.search(r"[123]", syll[position]):
            return syll[position:]
    return syll[:]

find_last_word(tokens)

Find the last word in a list of tokens.

Source code in src/poetry_analysis/rhyme_detection.py
201
202
203
204
205
206
def find_last_word(tokens: list[str]) -> str:
    """Return the last token that is not punctuation, or "" if there is none."""
    non_punctuation = (token for token in reversed(tokens) if not utils.is_punctuation(token))
    return next(non_punctuation, "")

find_nucleus(word, orthographic=False)

Check if a word has a valid syllable nucleus.

Source code in src/poetry_analysis/rhyme_detection.py
81
82
83
84
85
86
def find_nucleus(word: str, orthographic: bool = False) -> re.Match | None:
    """Search a word for a valid syllable nucleus.

    Returns the first regex match of any valid nucleus in `word`,
    or None when the word contains none.
    """
    # Build one alternation group out of all valid nuclei.
    alternatives = "|".join(get_valid_nuclei(orthographic=orthographic))
    pattern = rf"({alternatives})"
    return re.search(pattern, word)

find_rhyming_line(current, previous_lines, orthographic=False)

Check if the current line rhymes with any of the previous lines.

Source code in src/poetry_analysis/rhyme_detection.py
209
210
211
212
213
214
215
216
217
218
def find_rhyming_line(current: Verse, previous_lines: list[Verse], orthographic: bool = False) -> tuple:
    """Look for the closest preceding line that rhymes with the current line.

    The previous lines are checked from the most recent one backwards.
    Returns a tuple (index into `previous_lines`, rhyme score) for the first
    line with a positive score, or (None, 0) if no line rhymes.
    """
    for position in range(len(previous_lines) - 1, -1, -1):
        candidate = previous_lines[position]
        # Both verses need a last token to be comparable.
        if candidate.last_token is not None and current.last_token is not None:
            score = score_rhyme(candidate.last_token, current.last_token, orthographic=orthographic)
            if score > 0:
                return position, score
    return None, 0

get_stanzas_from_transcription(transcription, orthographic=False)

Parse a dict of transcribed verse lines and return a list of stanzas.

Source code in src/poetry_analysis/rhyme_detection.py
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def get_stanzas_from_transcription(transcription: dict, orthographic: bool = False) -> list:
    """Parse a dict of transcribed verse lines and return a list of stanzas."""
    line_ids = [x for x in transcription if x.startswith("line_")]
    n_lines = len(line_ids)
    logging.debug("Number of lines in poem: %s", n_lines)
    poem = []
    stanza = []
    for line_n in line_ids:
        verse = transcription.get(line_n)
        if (verse is not None) and (len(verse) > 0):
            words, pron = zip(*verse, strict=False)
            verseline = list(words if orthographic else pron)
            stanza.append(verseline)
        else:
            if len(stanza) == 0:
                continue
            poem.append(stanza)
            stanza = []
    if len(poem) == 0 and len(stanza) > 0:
        poem.append(stanza)
    return poem

get_valid_nuclei(orthographic=False)

Return the list of valid syllable nuclei with either graphemes or Nofabet phonemes.

Parameters:

Name Type Description Default
orthographic bool

If True, return graphemes

False
Source code in src/poetry_analysis/rhyme_detection.py
72
73
74
75
76
77
78
def get_valid_nuclei(orthographic: bool = False) -> list:
    """Return the valid syllable nuclei, as graphemes or as Nofabet phonemes.

    Args:
        orthographic: If True, return graphemes
    """
    if orthographic:
        return utils.VALID_NUCLEI
    return phonetic_inventory.PHONES_NOFABET["nuclei"]

is_nucleus(symbol, orthographic=False)

Check if a phoneme or a letter is a valid syllable nucleus.

Source code in src/poetry_analysis/rhyme_detection.py
66
67
68
69
def is_nucleus(symbol: str, orthographic: bool = False) -> bool:
    """Tell whether a phoneme or a letter is a valid syllable nucleus."""
    nuclei = get_valid_nuclei(orthographic=orthographic)
    # Stress markers are not part of the nucleus inventory, so drop them first.
    bare_symbol = strip_stress(symbol)
    return bare_symbol in nuclei

is_schwa(string)

Check if a string object is the schwa sound.

Source code in src/poetry_analysis/rhyme_detection.py
89
90
91
92
def is_schwa(string: str) -> bool:
    """Tell whether a string, ignoring surrounding whitespace, is the schwa sound."""
    return string.strip() in ("e", "AX", "AX0")

is_stressed(syllable)

Check if a syllable is stressed by searching for stress markers.

Stress markers
  • 0: Vowel/syllable nucleus without stress
  • 1: Primary stress with toneme 1
  • 2: Primary stress with toneme 2
  • 3: Secondary stress

Examples:

>>> is_stressed("a1")
True
>>> is_stressed("a0")
False
>>> is_stressed(["a", "1"])
True
>>> is_stressed(["a", "0"])
False
Source code in src/poetry_analysis/rhyme_detection.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def is_stressed(syllable: str | list) -> bool:
    """Check if a syllable is stressed by searching for stress markers.

    Stress markers:
        - `0`: Vowel/syllable nucleus without stress
        - `1`: Primary stress with toneme 1
        - `2`: Primary stress with toneme 2
        - `3`: Secondary stress

    Examples:
        >>> is_stressed("a1")
        True
        >>> is_stressed("a0")
        False
        >>> is_stressed(["a", "1"])
        True
        >>> is_stressed(["a", "0"])
        False
    """
    if isinstance(syllable, list):
        syllable = " ".join(syllable)
    result = re.search(r"[123]", syllable)
    return bool(result)

longest_common_substring(string1, string2)

Find the longest common substring between two strings.

Implementation based on the pseudocode from: https://en.wikipedia.org/wiki/Longest_common_substring#Dynamic_programming

Source code in src/poetry_analysis/rhyme_detection.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def longest_common_substring(string1: str, string2: str) -> str:
    """Return the longest substring that occurs in both strings.

    If several common substrings share the maximal length, the one found
    first while scanning `string1` from the left is returned.

    Implementation based on the pseudocode from:
    https://en.wikipedia.org/wiki/Longest_common_substring#Dynamic_programming
    """
    # suffix_len[i][j] holds the length of the common suffix of
    # string1[:i] and string2[:j].
    suffix_len = np.zeros((len(string1) + 1, len(string2) + 1))
    longest = 0
    result = ""

    for i, char1 in enumerate(string1, start=1):
        for j, char2 in enumerate(string2, start=1):
            if char1 != char2:
                suffix_len[i][j] = 0
                continue
            suffix_len[i][j] = suffix_len[i - 1][j - 1] + 1
            if suffix_len[i][j] > longest:
                longest = suffix_len[i][j]
                result = string1[(i - int(longest)) : i]
    return result

main()

Main function to run the rhyme detection script.

Source code in src/poetry_analysis/rhyme_detection.py
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
def main():
    """Main function to run the rhyme detection script.

    Command line options:
        -f / --poemfile: path to a file that is passed to `tag_poem_file`
            with `write_to_file=True`.
        -t / --doctest: run the doctests in the module.
        -v / --verbose: log at debug level to a dated log file that is named
            after this module and placed next to it.
    """
    import argparse
    from datetime import date

    parser = argparse.ArgumentParser(description="Tag rhyme schemes in a poem.")
    parser.add_argument(
        "-f",
        "--poemfile",
        type=Path,
        help="Path to a json file with phonemic transcriptions.",
    )
    parser.add_argument(
        "-t",
        "--doctest",
        action="store_true",
        help="Run doctests in the module.",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Set logging level to debug.")
    args = parser.parse_args()

    if args.verbose:
        today = date.today()
        # Drop only the file extension. Splitting the whole path on "." would
        # cut it at the first dot in any directory name (e.g. ".venv").
        logging_file = f"{Path(__file__).with_suffix('')}_{today}.log"
        logging.basicConfig(level=logging.DEBUG, filename=logging_file, filemode="a")

    if args.poemfile:
        tag_poem_file(args.poemfile, write_to_file=True)

    if args.doctest:
        import doctest

        logging.debug("Running doctests...")
        results = doctest.testmod(verbose=True)
        # Only report success when no doctest actually failed.
        if results.failed:
            logging.error("%s of %s doctests failed.", results.failed, results.attempted)
        else:
            logging.info("Doctests passed.")

remove_syllable_onset(syllable)

Split a syllable nucleus and coda from the onset to find the rhyming part of the syllable.

Source code in src/poetry_analysis/rhyme_detection.py
 95
 96
 97
 98
 99
100
def remove_syllable_onset(syllable: list) -> list | None:
    """Split a syllable nucleus and coda from the onset to find the rhyming part of the syllable."""
    for idx, phone in enumerate(syllable):
        if is_nucleus(phone):
            return syllable[idx:]
    logging.debug("No nucleus found in %s", syllable)

score_rhyme(sequence1, sequence2, orthographic=False)

Check if two words rhyme and return a rhyming score.

Returns:

Type Description
float

1.0: Only the syllable nucleus + coda (=rhyme) match # perfect or proper rhyme

float

0.5: NØDRIM or lame rhyme. One of the words is fully contained in the other, e.g. 'tusenfryd' / 'fryd'

float

0.0: No match

Source code in src/poetry_analysis/rhyme_detection.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def score_rhyme(sequence1: str, sequence2: str, orthographic: bool = False) -> float:
    """Check if two words rhyme and return a rhyming score.

    The score is based on the ending that both sequences share. The checks
    below run in a fixed order and the first one that applies decides the score.

    Args:
        sequence1: first string to compare
        sequence2: second string to compare
        orthographic: passed on to `find_nucleus` to choose between
            graphemes and Nofabet phonemes as valid nuclei

    Returns:
        `1.0`:    Only the syllable nucleus + coda (=rhyme) match # perfect or proper rhyme
        `0.5`:    NØDRIM or lame rhyme. One of the words is fully contained in the other, e.g. 'tusenfryd' / 'fryd'
        `0.0`:    No match
    """

    # Only the ending shared by both sequences can form an end rhyme.
    substring = shared_ending_substring(sequence1, sequence2)

    if not substring:
        logging.debug("No shared ending substring found in %s and %s", sequence1, sequence2)
        return 0

    # The shared ending must contain a syllable nucleus to count as a rhyme.
    nucleus = find_nucleus(substring, orthographic=orthographic)

    if not nucleus:
        logging.debug("no nucleus found in %s", substring)
        return 0
    # The whole shared ending is a grammatical suffix.
    if utils.is_grammatical_suffix(substring):
        logging.debug("only the grammatical suffixes match: %s", substring)
        # e.g. "arbeidet" / "skrevet"
        return 0
    # The shared ending, counted from its first nucleus, is a grammatical suffix.
    if utils.is_grammatical_suffix(substring[nucleus.start() :]):
        logging.debug("the rhyming part is a grammatical suffix: %s", substring[nucleus.start() :])
        # e.g. "blomster" / "fester"
        return 0
    # The shared ending consists of nothing but a schwa.
    if is_schwa(substring):
        logging.debug(
            "the rhyming part is scwha (%s) and the words share no other vowels: %s",
            substring,
            (sequence1, sequence2),
        )
        return 0

    # NOTE(review): shared_ending_substring returns an ending common to both
    # sequences, so this guard looks unreachable — confirm before removing.
    if not sequence1.endswith(substring) or not sequence2.endswith(substring):
        # not an end rhyme
        logging.debug("not an end rhyme: %s and %s", sequence1, sequence2)
        return 0
    # The shared ending equals one of the sequences (this includes identical sequences).
    if substring in (sequence1, sequence2):
        # one of the words is fully contained in the other
        logging.debug("Nødrim: %s and %s", sequence1, sequence2)
        return 0.5

    # NOTE(review): `nucleus` is already known to be truthy here, and identical
    # sequences were handled by the check above, so this condition presumably
    # always holds and the fallback below is never reached — confirm.
    if nucleus and (sequence1 != sequence2):
        logging.debug("Proper rhyme: %s and %s", sequence1, sequence2)
        return 1
    # otherwise, assume that the words do not rhyme
    logging.debug("No condition met for a rhyme: %s and %s", sequence1, sequence2)
    return 0

shared_ending_substring(string1, string2)

Find the shared substring at the end of two strings.

Source code in src/poetry_analysis/rhyme_detection.py
180
181
182
183
184
185
186
187
188
def shared_ending_substring(string1: str, string2: str) -> str:
    """Return the longest ending that both strings have in common ("" if none)."""
    # Walk both strings from the end and count the characters that agree.
    shared_length = 0
    for char1, char2 in zip(reversed(string1), reversed(string2)):
        if char1 != char2:
            break
        shared_length += 1

    if shared_length == 0:
        return ""
    return string1[-shared_length:]

strip_stress(phoneme)

Strip the stress marker from a phoneme.

Source code in src/poetry_analysis/rhyme_detection.py
61
62
63
def strip_stress(phoneme: str) -> str:
    """Remove stress markers (0, 1, 2, 3) from both ends of a phoneme."""
    stress_markers = "0123"
    return phoneme.strip(stress_markers)

tag_poem_file(poem_file, write_to_file=False)

Annotate rhyming schemes in a poem from a file.

Source code in src/poetry_analysis/rhyme_detection.py
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
def tag_poem_file(poem_file: str, write_to_file: bool = False) -> list:
    """Annotate rhyming schemes in a poem from a file."""
    # Assume that the stanzas are independent of each other
    # and that the rhyme scheme is unique to each stanza

    filepath = Path(poem_file)
    file_content = filepath.read_text(encoding="utf-8")
    if filepath.suffix == ".json":
        poem = json.loads(file_content)
        poem_id = poem.get("text_id")
        orthographic = False
        stanzas = get_stanzas_from_transcription(poem, orthographic=orthographic)

    elif filepath.suffix == ".txt":
        poem_id = filepath.stem.split("_")[0]
        stanzas = utils.split_stanzas(file_content)
        orthographic = True

    logging.debug("Tagging poem: %s", poem_id)

    file_annotations = list(tag_stanzas(stanzas, orthographic=orthographic))

    if write_to_file:
        outputfile = filepath.parent / f"{filepath.stem}_rhyme_scheme.json"
        with outputfile.open("w") as f:
            f.write(json.dumps(file_annotations, ensure_ascii=False, indent=4))

        logging.debug("Saved rhyme scheme annotations for poem %s to \n\t%s", poem_id, outputfile)
    return file_annotations

tag_rhyming_verses(verses, orthographic=False)

Annotate end rhyme patterns in a poem stanza.

Parameters:

Name Type Description Default
verses list

list of verselines with words

required
orthographic bool

if True, the word strings are orthographic, otherwise assume phonemic Nofabet transcriptions

False

Return: list of annotated verses with rhyme scores and rhyme tags

Source code in src/poetry_analysis/rhyme_detection.py
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def tag_rhyming_verses(verses: list, orthographic: bool = False) -> list:
    """Annotate end rhyme patterns in a poem stanza.

    Every verse is compared with the verses tagged before it. A verse that
    rhymes with an earlier one inherits that verse's rhyme tag; otherwise it
    gets the next unused letter. Empty verse lines are skipped.

    Args:
        verses: list of verselines with words
        orthographic: if True, the words strings are orthographic,
            otherwise assume phonemic nofabet transcriptions
    Return:
        list of annotated verses with rhyme scores and rhyme tags
    """
    tag_source = iter(string.ascii_letters)

    tagged_verses = []  # must stay a list: earlier verses are looked up by position
    for verse_idx, line in enumerate(verses):
        if not line:
            continue

        if not orthographic:
            # Phonemic input: compare from the last stressed syllable, without stress markers.
            syllables = utils.convert_to_syllables(line, ipa=False)
            stressed_tail = find_last_stressed_syllable(syllables)
            rhyme_part = " ".join(stressed_tail)

            verse = Verse(
                id_=verse_idx,
                transcription="\t".join(line),
                tokens=line,
                syllables=syllables,
                last_token=re.sub(r"[0123]", "", rhyme_part),
            )
        else:
            # Orthographic input: compare the casefolded last word.
            tokens = utils.normalize(line)
            final_word = find_last_word(tokens)
            if not final_word:
                logging.debug("No tokens found in %s", line)
                continue
            verse = Verse(
                id_=verse_idx,
                text=line,
                tokens=tokens,
                last_token=final_word.casefold(),
            )

        match_idx, score = find_rhyming_line(verse, tagged_verses, orthographic=orthographic)

        if match_idx is None or score <= 0:
            # No rhyme found: hand out a fresh tag, restarting the alphabet when it is used up.
            fresh_tag = next(tag_source, None)
            if fresh_tag is None:
                logging.info("Ran out of rhyme tags at %s! Initialising new alphabet.", verse_idx)
                tag_source = iter(string.ascii_letters)
                fresh_tag = next(tag_source)
            verse.rhyme_tag = fresh_tag
        else:
            partner = tagged_verses[match_idx]
            verse.rhyme_tag = partner.rhyme_tag
            verse.rhyme_score = score
            verse.rhymes_with = partner.id_

        tagged_verses.append(verse)
    return tagged_verses

tag_stanzas(stanzas, orthographic=False)

Iterate over stanzas and tag verses with a rhyme scheme.

Source code in src/poetry_analysis/rhyme_detection.py
310
311
312
313
314
315
316
317
318
319
320
def tag_stanzas(stanzas: list, orthographic: bool = False) -> Generator:
    """Tag the verses of each stanza and yield one annotation dict per stanza.

    Each yielded dict has the keys "stanza_id" (position of the stanza),
    "rhyme_scheme" and "verses" (the tagged verses as dictionaries).
    """
    for stanza_id, stanza in enumerate(stanzas):
        tagged_verses = tag_rhyming_verses(stanza, orthographic=orthographic)
        annotation = {
            "stanza_id": stanza_id,
            "rhyme_scheme": collate_rhyme_scheme(tagged_verses),
            "verses": [tagged_verse.dict for tagged_verse in tagged_verses],
        }
        yield annotation

tag_text(text)

Annotate rhyming schemes in a text where stanzas are separated by two empty lines.

Source code in src/poetry_analysis/rhyme_detection.py
323
324
325
326
327
def tag_text(text: str) -> Generator:
    """Annotate rhyming schemes in a text where stanzas are separated by two empty lines.

    The text is split with `utils.split_stanzas` and tagged orthographically;
    the result is the generator returned by `tag_stanzas`.
    """
    return tag_stanzas(utils.split_stanzas(text), orthographic=True)