Skip to content

Anaphora

poetry_analysis.anaphora

Anaphora is the repetition of the same line-initial word or phrase in a verse, or across consecutive verses in a stanza.

TODO: It can also refer to the repetition of a whole stanza-initial verse line in consecutive stanzas.

NOTE: This has not been implemented yet. This anaphora detection process is based on the repetition of the first word in each line. We will continue with implementing a grading system for how effective the figure is in each poem.

count_initial_phrases(text)

Count the number of times string-initial phrases of different lengths occur in a string.

Source code in src/poetry_analysis/anaphora.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def count_initial_phrases(text: str) -> Counter:
    """Count how often each string-initial phrase occurs in a string.

    The text is stripped, lowercased, passed through ``utils.strip_punctuation``
    and split into tokens with ``utils.tokenize``. For every prefix length ``n``
    (from 1 up to the number of tokens) the phrase made of the first ``n`` tokens
    is counted wherever it occurs as a run of whole tokens.

    Matching is done on token boundaries rather than with a substring search, so
    a short phrase such as "a" is not counted inside longer words like "banana".
    Occurrences are counted left to right without overlap.

    Args:
        text: The string to analyse, typically a single verse line.

    Returns:
        A Counter mapping each initial phrase (tokens joined by single spaces)
        to the number of times it occurs. Empty if the text yields no tokens.
    """
    phrase_counts = Counter()

    lowercase = text.strip().lower()
    normalized_text = utils.strip_punctuation(lowercase)
    words = list(utils.tokenize(normalized_text))
    n_words = len(words)

    for n in range(1, n_words + 1):
        prefix = words[:n]
        count = 0
        start = 0
        while start + n <= n_words:
            if words[start : start + n] == prefix:
                count += 1
                # Jump past the match so that occurrences never overlap.
                start += n
            else:
                start += 1
        # The prefix always matches itself at position 0, so count >= 1 here.
        phrase_counts[" ".join(prefix)] += count
    return phrase_counts

detect_repeating_lines(text)

Detect repeating lines in a poem.

Source code in src/poetry_analysis/anaphora.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def detect_repeating_lines(text: str) -> list:
    """Detect repeating lines in a poem.

    The poem is split into stanzas with ``utils.split_stanzas``; the lines of all
    stanzas are stripped of surrounding whitespace and numbered consecutively
    from 0 across the whole poem.

    Args:
        text: The full poem text.

    Returns:
        A list of ``(indices, line)`` tuples, one per line that occurs more than
        once, where ``indices`` lists every position at which the line occurs.
        Entries are ordered by the first occurrence of each line.
    """
    stanzas = utils.split_stanzas(text)
    stripped_lines = (line.strip() for stanza in stanzas for line in stanza)

    # Group positions by line content in a single pass instead of calling
    # list.count() for every line, which was quadratic in the number of lines.
    positions = {}
    for idx, line in enumerate(stripped_lines):
        positions.setdefault(line, []).append(idx)

    return [(indices, line) for line, indices in positions.items() if len(indices) > 1]

extract_anaphora(text)

Extract line-initial word sequences that are repeated at least twice.

Examples:

>>> import json
>>> text = '''
... Jeg ser paa den hvide himmel,
... jeg ser paa de graablaa skyer,
... jeg ser paa den blodige sol.
...
... Dette er altsaa verden.
... Dette er altsaa klodernes hjem.
...
... En regndraabe!
... '''
>>> result = extract_anaphora(text)
>>> print(json.dumps(result, indent=4))
{
    "1-grams": {
        "jeg": 3,
        "dette": 2
    },
    "2-grams": {
        "jeg ser": 3,
        "dette er": 2
    },
    "3-grams": {
        "jeg ser paa": 3,
        "dette er altsaa": 2
    },
    "4-grams": {
        "jeg ser paa den": 2
    }
}
Source code in src/poetry_analysis/anaphora.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def extract_anaphora(text: str) -> dict:
    """Extract line-initial word sequences that are repeated at least twice.

    Every line of the stripped, lowercased text is cleaned with
    ``utils.strip_punctuation`` and split on whitespace. Line-initial n-grams of
    1 to 4 words are counted across all lines of the text, and only those that
    start more than one line are reported.

    Args:
        text: The poem text, one verse per line.

    Returns:
        A dictionary keyed by ``"<n>-grams"`` whose values map each repeated
        line-initial n-gram to the number of lines it starts. N-gram sizes
        without any repeated phrase are left out.

    Examples:
        >>> import json
        >>> text = '''
        ... Jeg ser paa den hvide himmel,
        ... jeg ser paa de graablaa skyer,
        ... jeg ser paa den blodige sol.
        ...
        ... Dette er altsaa verden.
        ... Dette er altsaa klodernes hjem.
        ...
        ... En regndraabe!
        ... '''
        >>> result = extract_anaphora(text)
        >>> print(json.dumps(result, indent=4))
        {
            "1-grams": {
                "jeg": 3,
                "dette": 2
            },
            "2-grams": {
                "jeg ser": 3,
                "dette er": 2
            },
            "3-grams": {
                "jeg ser paa": 3,
                "dette er altsaa": 2
            },
            "4-grams": {
                "jeg ser paa den": 2
            }
        }
    """
    # Only 1- to 4-grams are reported, so longer prefixes are never built.
    max_ngram_size = 4
    ngram_counts = defaultdict(lambda: defaultdict(int))

    for line in text.strip().lower().splitlines():
        words = utils.strip_punctuation(line).split()
        for n in range(1, min(len(words), max_ngram_size) + 1):
            ngram_counts[n][" ".join(words[:n])] += 1

    anaphora = {}
    for n in range(1, max_ngram_size + 1):
        repeated = {ngram: count for ngram, count in ngram_counts[n].items() if count > 1}
        if repeated:
            anaphora[f"{n}-grams"] = repeated
    return anaphora

extract_line_anaphora(text)

Extract line-initial word sequences that are repeated at least twice on the same line.

Source code in src/poetry_analysis/anaphora.py
49
50
51
52
53
54
55
56
57
58
59
def extract_line_anaphora(text: str) -> list:
    """Find, for each line, the line-initial phrase that is repeated within that line.

    Args:
        text: The poem text; it is stripped and split into lines.

    Returns:
        A list of annotation dictionaries with the keys ``"line_id"`` (index of
        the line in the stripped text), ``"phrase"`` and ``"count"``, one for
        every line whose selected initial phrase occurs more than once.
    """
    annotations = []
    for line_number, verse in enumerate(text.strip().splitlines()):
        best_phrase, occurrences = find_longest_most_frequent_anaphora(count_initial_phrases(verse))
        if occurrences <= 1:
            # The line has no initial phrase that repeats.
            continue
        annotations.append({"line_id": line_number, "phrase": best_phrase, "count": occurrences})
    return annotations

extract_poem_anaphora(text)

Extract line-initial word sequences that are repeated at least twice in each stanza.

Source code in src/poetry_analysis/anaphora.py
116
117
118
119
120
121
122
123
124
125
126
127
128
def extract_poem_anaphora(text: str) -> list:
    """Collect the anaphora annotations of every stanza in a poem.

    The poem is split with ``utils.split_stanzas``; each stanza is analysed with
    ``extract_stanza_anaphora`` and the result narrowed down by
    ``filter_anaphora``.

    Args:
        text: The full poem text.

    Returns:
        A list of annotation dictionaries as yielded by ``filter_anaphora``,
        each extended with a ``"stanza_id"`` key holding the stanza's index.
    """
    return [
        {**annotation, "stanza_id": stanza_index}
        for stanza_index, stanza_lines in enumerate(utils.split_stanzas(text))
        for annotation in filter_anaphora(extract_stanza_anaphora(stanza_lines))
    ]

extract_stanza_anaphora(stanza, n_words=1)

Gather the indices of successive lines that begin with the same line-initial word or phrase.

Parameters:

Name Type Description Default
n_words int

Number of words to expect in the anaphora, must be 1 or higher. If higher, a single word that is repeated more often than a phrase of n_words will be ignored in favour of the less frequent phrase.

1
Source code in src/poetry_analysis/anaphora.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def extract_stanza_anaphora(stanza: list[str], n_words: int = 1) -> dict:
    """Gather the indices of successive lines that begin with the same phrase.

    Each non-empty line is passed to ``utils.normalize`` (assumed to return the
    line's words as a list) and its first ``n_words`` words form the line-initial
    phrase. Directly consecutive lines with the same phrase form a run; an empty
    line, or a line that normalizes to nothing, breaks the run.

    If the same phrase starts several separate runs in the stanza, the longest
    run is kept (the later one when they are equally long). Previously a later
    run always replaced an earlier one, so an isolated later line could erase an
    anaphora that had already been found.

    Args:
        stanza: The lines of one stanza.
        n_words: Number of words to expect in the anaphora, must be 1 or higher.
            If higher, a single word that is repeated more often than a phrase of
            n_words will be ignored in favour of the less frequent phrase.

    Returns:
        A dictionary mapping each line-initial phrase to the list of consecutive
        line indices of its retained run, ordered by first appearance.
    """
    # (phrase, line indices) for every unbroken run, in stanza order.
    runs = []
    previous_phrase = None
    for line_index, line in enumerate(stanza):
        words = utils.normalize(line) if line else []
        if not words:
            # A blank line interrupts any run that is in progress.
            previous_phrase = None
            continue

        first_phrase = " ".join(words[:n_words])
        if first_phrase == previous_phrase:
            runs[-1][1].append(line_index)
        else:
            runs.append((first_phrase, [line_index]))
        previous_phrase = first_phrase

    stanza_anaphora = {}
    for phrase, line_indices in runs:
        kept = stanza_anaphora.get(phrase)
        # ">=" lets a later run win ties, matching the earlier overwrite behaviour.
        if kept is None or len(line_indices) >= len(kept):
            stanza_anaphora[phrase] = line_indices
    return stanza_anaphora

filter_anaphora(stanza_anaphora)

Construct and yield an annotation dictionary only for stanzas where anaphora are immediately successive.

Source code in src/poetry_analysis/anaphora.py
67
68
69
70
71
72
73
74
75
76
77
78
79
def filter_anaphora(stanza_anaphora: dict) -> Generator:
    """Yield an annotation for every phrase whose lines directly follow each other.

    Args:
        stanza_anaphora: Mapping from a line-initial phrase to the list of line
            indices on which it occurs.

    Yields:
        A dictionary with the keys ``"line_id"`` (the list of indices),
        ``"phrase"`` and ``"count"`` for each phrase that has more than one
        index and whose indices each increase by exactly one.
    """
    for phrase, line_ids in stanza_anaphora.items():
        repeated = len(line_ids) > 1
        if repeated and all(is_successive(line_ids)):
            yield {"line_id": line_ids, "phrase": phrase, "count": len(line_ids)}

find_longest_most_frequent_anaphora(phrases)

Find the longest and most repeated word sequence in a counter.

Source code in src/poetry_analysis/anaphora.py
36
37
38
39
40
41
42
43
44
45
46
def find_longest_most_frequent_anaphora(phrases: Counter) -> tuple:
    """Find the longest phrase among the most frequent ones in a counter.

    Args:
        phrases: Counter mapping phrases to their number of occurrences.

    Returns:
        A ``(phrase, count)`` tuple holding the phrase with the most characters
        among those sharing the highest count. If several are equally long, the
        one inserted first wins. ``(None, 0)`` is returned for an empty counter.
    """
    if not phrases:
        return (None, 0)

    # One pass for the maximum instead of fully sorting the counter twice.
    highest_count = max(phrases.values())
    # Iterating in insertion order yields ties in the same order as most_common().
    top_phrases = [phrase for phrase, count in phrases.items() if count == highest_count]

    return max(top_phrases, key=len), highest_count

is_successive(items)

Check, for each pair of adjacent numbers in a list, whether the second is exactly one greater than the first.

Source code in src/poetry_analysis/anaphora.py
62
63
64
def is_successive(items: list[int]) -> list[bool]:
    """Check each pair of adjacent numbers for an increase of exactly one.

    Args:
        items: The numbers to check, in order.

    Returns:
        One boolean per adjacent pair, ``True`` where the second number equals
        the first plus one. The list is empty when ``items`` has fewer than two
        elements; pass the result to ``all()`` to test the whole sequence.
    """
    return [current == previous + 1 for previous, current in zip(items, items[1:])]