Skip to content

Utility functions

poetry_analysis.utils

convert_to_syllables(phonemes, ipa=False)

Turn a sequence of phonemes into syllable groups.

Source code in src/poetry_analysis/utils.py
89
90
91
92
93
94
95
96
97
98
def convert_to_syllables(phonemes: str | list, ipa: bool = False) -> list:
    """Group a phoneme sequence into syllables.

    Args:
        phonemes: Either a transcription string, used as is, or a list of
            phoneme strings, which is joined with single spaces first.
        ipa: When true, the transcription is passed through
            ``nofabet_to_ipa`` and the result is split on ``"."``.
            Otherwise ``nofabet_to_syllables`` is used and every syllable
            it yields is joined with single spaces.

    Returns:
        A list with one string per syllable.
    """
    if isinstance(phonemes, str):
        transcription = phonemes
    else:
        transcription = " ".join(phonemes)

    if not ipa:
        return [" ".join(parts) for parts in nofabet_to_syllables(transcription)]
    return nofabet_to_ipa(transcription).split(".")

endswith(sequence, suffix)

Check if a sequence ends with a given suffix.

Source code in src/poetry_analysis/utils.py
50
51
52
53
54
55
56
57
58
59
def endswith(sequence: str | list[str], suffix: str) -> bool:
    """Check if a sequence ends with a given suffix."""
    if isinstance(sequence, str):
        return sequence.endswith(suffix)
    elif isinstance(sequence, list):
        last_element = sequence.copy().pop()
        if isinstance(last_element, str):
            return last_element.endswith(suffix)
        return False
    return False

gather_stanza_annotations(func)

Decorator to apply a function to each stanza in a text.

Source code in src/poetry_analysis/utils.py
243
244
245
246
247
248
249
250
251
252
253
254
def gather_stanza_annotations(func) -> Callable:
    """Decorator to apply a function to each stanza in a text.

    The decorated function receives one stanza at a time, as a single string
    with its verses joined by newlines.

    Args:
        func: A callable taking the text of one stanza.

    Returns:
        A wrapper that takes the full text, splits it with ``split_stanzas``
        and returns a dict mapping ``"stanza_1"``, ``"stanza_2"``, ... to the
        result of ``func`` for that stanza. The wrapper keeps the name,
        docstring and other metadata of ``func``.
    """
    # Imported locally so this decorator does not rely on a module-level import.
    import functools

    @functools.wraps(func)  # preserve func's __name__/__doc__ for introspection
    def wrapper(text: str) -> dict:
        stanza_annotations = {}
        # Stanza keys are numbered from 1.
        for i, stanza in enumerate(split_stanzas(text), 1):
            stanza_annotations[f"stanza_{i}"] = func("\n".join(stanza))
        return stanza_annotations

    return wrapper

group_consecutive_numbers(nums)

Group consecutive numbers into sublists.

Examples:

>>> list_of_numbers = [1, 2, 3, 5, 6, 8, 9, 10]
>>> result = group_consecutive_numbers(list_of_numbers)
>>> print(result)
[[1, 2, 3], [5, 6], [8, 9, 10]]
Source code in src/poetry_analysis/utils.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
def group_consecutive_numbers(nums: list[int]) -> list[list[int]]:
    """Partition numbers into runs where each value is one more than the last.

    The input is sorted first (the caller's list is left untouched). A value
    that is not exactly one greater than its predecessor, including a
    repeated value, starts a new run.

    Examples:
        >>> list_of_numbers = [1, 2, 3, 5, 6, 8, 9, 10]
        >>> result = group_consecutive_numbers(list_of_numbers)
        >>> print(result)
        [[1, 2, 3], [5, 6], [8, 9, 10]]
    """
    if not nums:
        return []

    ordered = sorted(nums)
    groups = [[ordered[0]]]

    for value in ordered[1:]:
        run = groups[-1]
        # The last item of the open run is always the previous sorted value.
        if value == run[-1] + 1:
            run.append(value)
        else:
            groups.append([value])

    return groups

is_punctuation(char)

Check if a character is a punctuation mark.

Source code in src/poetry_analysis/utils.py
62
63
64
def is_punctuation(char: str) -> bool:
    """Return whether ``char`` is a member of ``PUNCTUATION_MARKS``."""
    found = char in PUNCTUATION_MARKS
    return found

is_valid_onset(phonelist)

WORK IN PROGRESS: Check whether a sequence of characters forms a valid onset in Norwegian orthography.

Parameters:

Name Type Description Default
phonelist str

A string representing the onset (e.g., "bl", "tr").

required

Returns:

Name Type Description
bool bool

True if the onset is valid, False otherwise.

Source code in src/poetry_analysis/utils.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# Single letters accepted as an onset on their own (lowercase only).
_VALID_SINGLE_CONSONANTS = frozenset("bcdfghjklmnpqrstvwxyz")

# Multi-letter clusters accepted as an onset (lowercase only).
_VALID_ONSET_CLUSTERS = frozenset(
    {
        "bj", "bl", "br",
        "dr", "dj",
        "fl", "fj", "fr",
        "gl", "gr", "gj", "gn",
        "hv",
        "kj", "kl", "kr", "kn", "kv",
        "pl", "pj", "pr",
        "mj", "nj",
        "sj", "sl", "sm", "sn", "sp", "st", "sv",
        "sk", "skr", "spr", "str", "skj",
        "tr", "tj", "tl",
        "vr",
    }
)


def is_valid_onset(phonelist: str) -> bool:
    """
    WORK IN PROGRESS
    Check if a sequence of characters forms a valid onset in Norwegian orthography.

    The lookup tables are built once at import time instead of on every
    call. Matching is exact and case-sensitive: only the lowercase spellings
    listed in the tables are accepted.

    Args:
        phonelist (str): A string representing the onset (e.g., "bl", "tr").

    Returns:
        bool: True if the onset is a listed single consonant or a listed
            consonant cluster, False otherwise (including the empty string).
    """
    return phonelist in _VALID_SINGLE_CONSONANTS or phonelist in _VALID_ONSET_CLUSTERS

make_comparable_string(item)

Convert a list of strings into a single comparable string.

Source code in src/poetry_analysis/utils.py
81
82
83
84
85
86
def make_comparable_string(item: list | str) -> str:
    """Reduce a string or list of strings to a form suited for comparison.

    A list is joined with single spaces; anything else is passed through
    ``str()``. The text then goes through ``strip_punctuation``, has the
    digits 0-3 removed and is case-folded.
    """
    if isinstance(item, list):
        joined = " ".join(item)
    else:
        joined = str(item)

    unpunctuated = strip_punctuation(joined)
    # Digits 0-3 are treated as stress markers and dropped.
    without_stress = re.sub(r"[0123]", "", unpunctuated)
    return without_stress.casefold()

normalize(text)

Lowercase, remove punctuation and tokenize a string of text.

Source code in src/poetry_analysis/utils.py
262
263
264
265
266
267
def normalize(text: str) -> list[str]:
    """Turn raw text into a list of tokens.

    The text is stripped of surrounding whitespace and lowercased, passed
    through ``strip_punctuation``, and finally handed to ``tokenize``.
    """
    cleaned = strip_punctuation(text.strip().lower())
    return tokenize(cleaned)

split_orthographic_text_into_syllables(words)

WORK IN PROGRESS: Split orthographic text into syllables using basic rules. This is a simplified implementation and may not handle all edge cases.

Parameters:

Name Type Description Default
words list of str

A list of orthographic words, already tokenized

required

Returns:

Name Type Description
list list

A list of syllables for each word in the text.

Source code in src/poetry_analysis/utils.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def split_orthographic_text_into_syllables(words: list[str]) -> list:
    """
    WORK IN PROGRESS
    Break each orthographic word into syllable-like chunks with simple rules.
    This is a simplified implementation and may not handle all edge cases.

    A chunk is closed after a character found in ``VALID_NUCLEI`` unless the
    following character is also in ``VALID_NUCLEI``, or the two-character
    string made of the current and the following character is accepted by
    ``is_valid_onset``. Characters left over at the end of a word form a
    final chunk.

    Args:
        words (list of str): A list of orthographic words, already tokenized

    Returns:
        list: One list of chunks per input word, in input order.
    """
    result = []

    for word in words:
        chunks = []
        buffer = ""
        final_index = len(word) - 1

        for index, letter in enumerate(word):
            buffer += letter

            # Only a nucleus character can close a chunk.
            if letter not in VALID_NUCLEI:
                continue

            if index < final_index:
                following = word[index + 1]

                # The next character extends the same nucleus.
                if following in VALID_NUCLEI:
                    continue

                # NOTE(review): the candidate begins with the nucleus character
                # itself rather than with a consonant - confirm this is intended.
                candidate = letter + following
                if len(candidate) > 1 and is_valid_onset(candidate):
                    continue

            chunks.append(buffer)
            buffer = ""

        # Whatever follows the last nucleus becomes its own chunk.
        if buffer:
            chunks.append(buffer)

        result.append(chunks)

    return result

split_paragraphs(text)

Split a text into paragraphs and paragraphs into lines.

Source code in src/poetry_analysis/utils.py
230
231
232
233
234
235
236
def split_paragraphs(text: str) -> list:
    """Split a text into paragraphs and paragraphs into lines.

    Paragraphs are separated by two or more consecutive newline characters.
    Trailing whitespace is removed from each paragraph and from each line.
    Chunks that are empty or contain only whitespace are skipped, so the
    result never contains an empty paragraph.

    Args:
        text: The text to split.

    Returns:
        A list of paragraphs, each a list of line strings.
    """
    return [
        [line.rstrip() for line in paragraph.rstrip().splitlines()]
        for paragraph in re.split(r"\n{2,}", text)
        # A whitespace-only chunk would otherwise become an empty list.
        if paragraph.strip()
    ]

split_stanzas(text)

Split a poem into stanzas and stanzas into verses.

Source code in src/poetry_analysis/utils.py
257
258
259
def split_stanzas(text: str) -> list:
    """Split a poem into stanzas and stanzas into verses.

    Stanzas are separated by two or more consecutive newline characters.
    Trailing whitespace is removed from each stanza and from each verse.
    Chunks that are empty or contain only whitespace are skipped, so the
    result never contains an empty stanza.

    Args:
        text: The poem to split.

    Returns:
        A list of stanzas, each a list of verse strings.
    """
    return [
        [verse.rstrip() for verse in stanza.rstrip().splitlines()]
        for stanza in re.split(r"\n{2,}", text)
        # A whitespace-only chunk would otherwise become an empty list.
        if stanza.strip()
    ]

strip_punctuation(string)

Remove punctuation from a string.

Source code in src/poetry_analysis/utils.py
72
73
74
75
76
77
78
def strip_punctuation(string: str) -> str:
    """Remove punctuation from a string.

    Every character for which ``is_punctuation`` returns true is dropped;
    the remainder is passed through ``strip_redundant_whitespace``.

    Args:
        string: The text to clean.

    Returns:
        The text without punctuation characters.
    """
    # Join once instead of growing a string character by character.
    kept = "".join(char for char in string if not is_punctuation(char))
    return strip_redundant_whitespace(kept)

strip_redundant_whitespace(text)

Strip redundant whitespace and reduce it to a single space.

Source code in src/poetry_analysis/utils.py
67
68
69
def strip_redundant_whitespace(text: str) -> str:
    """Collapse each run of whitespace to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

syllabify(transcription)

Flatten list of syllables from a list of transcribed words.

Source code in src/poetry_analysis/utils.py
101
102
103
104
105
106
107
108
def syllabify(transcription: list[list]) -> list:
    """Collect the syllables of all transcribed words into one flat list.

    Each item of ``transcription`` is unpacked as a pair; only the second
    member is used and passed to ``convert_to_syllables`` with ``ipa=False``.
    """
    flattened = []
    for _word, pronunciation in transcription:
        flattened.extend(convert_to_syllables(pronunciation, ipa=False))
    return flattened