# -*- coding: utf-8 -*-
"""
Created on Thu Mar 22 10:27:40 2018
Space is limited
In a haiku, so it's hard
To finish what you
@author: tpaye
"""
# import nltk
# nltk.download('cmudict')
# nltk.download('wordnet')
from nltk.corpus import cmudict
from nltk.corpus import wordnet as wn
import pyphen
import re, string
from gensim.models import KeyedVectors
#%%
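# The .word2vec file is the GloVe vectors converted to word2vec format; if
# you only have the raw GloVe file, gensim's bundled helper can convert it
# once beforehand (a sketch; assumes glove.6B.50d.txt is in the working
# directory):
#   from gensim.scripts.glove2word2vec import glove2word2vec
#   glove2word2vec('glove.6B.50d.txt', 'glove.6B.50d.txt.word2vec')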
# load the Stanford GloVe model
filename = 'glove.6B.50d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print('Done loading model')
#%%
cmu_dict = cmudict.dict()
pyphen_dict = pyphen.Pyphen(lang='en')
def remove_punctuation(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub('', s)
# Counts the syllables in a word (might make some mistakes).
# https://stackoverflow.com/questions/405161/detecting-syllables-in-a-word
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word
def nsyl(word):
    word = remove_punctuation(word).lower()
    if word == "":
        return 0
    try:
        # list comprehensions make code more readable :^)
        # count the stress-marked phonemes (the vowels) in the CMU entry;
        # [0] takes the first one if the word can be pronounced in multiple ways
        return [len([y for y in x if y[-1].isdigit()]) for x in cmu_dict[word]][0]
    except KeyError:
        # unknown word: estimate via pyphen's hyphenation instead
        return len(pyphen_dict.inserted(word).split('-'))
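# Quick sanity checks (illustrative; known words come from cmudict, unknown
# ones fall back to pyphen, so a few counts may be off):
#   nsyl("haiku")    -> 2
#   nsyl("syllable") -> 3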
# counts the number of syllables in a list of words
def syl_in_words(words):
    return sum(nsyl(word) for word in words)
# counts the number of syllables in a sentence
def syl_in_sentence(sentence):
    sentence = remove_punctuation(sentence)
    return syl_in_words(sentence.split())
# True if some prefix of `words` adds up to exactly n syllables
def beginning_is_n_syllables(words, n):
    sub_counter = 0
    for word in words:
        sub_counter += nsyl(word)
        if sub_counter == n:
            return True
        elif sub_counter > n:
            return False
    return False
def words_to_sentence(ws):
    return " ".join(ws)
# A haiku is a short poem with 17 syllables arranged in a 5-7-5 pattern
def is_haiku(sentence):
    if syl_in_sentence(sentence) != 17:
        return False
    sentence = remove_punctuation(sentence)
    words = sentence.split()
    sub_counter = 0
    line_counter = 0
    for word in words:
        sub_counter += nsyl(word)
        # line targets alternate 5, 7, 5 as line_counter goes 0, 1, 2
        if sub_counter == 5 + ((line_counter % 2) * 2):
            sub_counter = 0
            line_counter += 1
    return line_counter == 3
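# A classic Basho translation that should pass, since cmudict counts it as
# a clean 5-7-5 (word-by-word: 1+1+2+1 / 1+1+1+2+1+1 / 1+2+2):
#   is_haiku("An old silent pond a frog jumps into the pond splash silence again")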
# when you give it a sentence that is a haiku, formats it with line breaks
def format_haiku(sentence):
    if is_haiku(sentence):
        haiku = ""
        words = sentence.split()
        sub_counter = 0
        for word in words:
            haiku += word
            sub_counter += nsyl(word)
            # break after the 5th and 12th syllables (5-7-5)
            if sub_counter == 5 or sub_counter == 12:
                haiku += "\n"
            else:
                haiku += " "
        return haiku[:-1]
    else:
        return "Not a Haiku: " + sentence
# drops the first n syllables' worth of words and returns the rest
# (assumes the running count breaks cleanly at n)
def cut_off(words, n):
    sub_counter = 0
    short_words = []
    for word in words:
        sub_counter += nsyl(word)
        if sub_counter > n:
            short_words.append(word)
    return short_words
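# e.g. cut_off(["old", "pond", "frog"], 1) -> ["pond", "frog"]
# (each of these words is one syllable)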
# returns the words that make up the first n syllables,
# or None if no prefix sums to exactly n
def cut_out(words, n):
    sub_counter = 0
    short_words = []
    for word in words:
        sub_counter += nsyl(word)
        short_words.append(word)
        if sub_counter == n:
            return short_words
    return None
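# e.g. cut_out(["old", "pond", "frog"], 2) -> ["old", "pond"]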
# returns up to 10 single-word synonyms, from the GloVe vectors if w2v is
# True, otherwise from WordNet
def get_synonyms(word, w2v):
    if word in ["a", "i", "be"]:
        return []
    ret_syns = []
    if w2v:
        try:
            synonyms = model.most_similar(word, topn=10)
        except KeyError:
            # word is not in the embedding vocabulary
            return []
        for synonym in synonyms:
            ret_syns.append(synonym[0])
    else:
        for ss in wn.synsets(word):
            ret_syns += ss.lemma_names()
    # keep only single-word candidates
    ret_syns = [e for e in ret_syns if '_' not in e and '.' not in e and '?' not in e]
    return ret_syns[:10]
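# e.g. get_synonyms("pond", False) draws on WordNet and may return
# candidates like 'pool'; with w2v=True the neighbours depend entirely on
# the loaded GloVe vectors.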
# tries to make the word list exactly n syllables long by swapping in
# shorter/longer synonyms; returns False if it gets stuck
def make_length_n(words, n):
    m = syl_in_words(words)
    new_words = words[:]
    w2v = False
    while m != n:
        for i, word in enumerate(words):
            word = word.lower()
            new_word = word
            synonyms = get_synonyms(word, w2v)
            for synonym in synonyms:
                if n < m and nsyl(synonym) < nsyl(word):
                    new_word = synonym
                    break
                elif n > m and nsyl(synonym) > nsyl(word):
                    new_word = synonym
                    break
            if new_word != word:
                new_words[i] = new_word
                m += nsyl(new_word) - nsyl(word)
        if words == new_words:
            if w2v:
                # neither WordNet nor word2vec could change anything
                return False
            else:
                # fall back to word2vec
                w2v = True
        words = new_words[:]
    return new_words
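# e.g. make_length_n(["beautiful", "day"], 3) hunts for a shorter synonym
# of "beautiful" (4 syllables total, target 3); which substitute it lands
# on depends on WordNet and the GloVe vectors.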
# splits a word list into two halves of roughly equal syllable count
def cut_half(ws):
    first_half = []
    second_half = []
    n = syl_in_words(ws)
    m = 0
    for w in ws:
        m += nsyl(w)
        if m > n / 2:
            second_half.append(w)
        else:
            first_half.append(w)
    return first_half, second_half
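# e.g. cut_half(["old", "pond", "frog"]) -> (["old"], ["pond", "frog"]),
# since the second one-syllable word already pushes the count past n/2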
# splits a word list into three parts of roughly equal syllable count
def cut_thirds(ws):
    first_third = []
    second_third = []
    third_third = []
    n = syl_in_words(ws)
    m = 0
    for w in ws:
        m += nsyl(w)
        if m < n / 3:
            first_third.append(w)
        elif m > 2 * (n / 3):
            third_third.append(w)
        else:
            second_third.append(w)
    return first_third, second_third, third_third
# reshapes `words` into 5-7-5; the flags say which parts already break
# cleanly: b = first 5 syllables, m = a 12-syllable boundary, e = last 5
def modify_words(words, b, m, e):
    bws = []
    mws = []
    ews = []
    if b and m:
        # the first 12 syllables are fine, fix the ending
        ews = make_length_n(cut_off(words, 12), 5)
        bws = cut_out(words, 12)
    elif e and m:
        # the last 12 syllables are fine, fix the beginning
        bws = make_length_n(cut_off(reversed(words), 12)[::-1], 5)
        ews = cut_out(reversed(words), 12)[::-1]
    elif b and e:
        # both ends are fine, fix the middle
        mws = make_length_n(cut_off(reversed(cut_off(reversed(words), 5)), 5), 7)
        bws = cut_out(words, 5)
        ews = cut_out(reversed(words), 5)[::-1]
    elif b:
        mws, ews = cut_half(cut_off(words, 5))
        mws = make_length_n(mws, 7)
        ews = make_length_n(ews, 5)
        bws = cut_out(words, 5)
    elif e:
        mws, bws = cut_half(cut_off(reversed(words), 5))
        bws = bws[::-1]
        mws = mws[::-1]
        bws = make_length_n(bws, 5)
        mws = make_length_n(mws, 7)
        ews = cut_out(reversed(words), 5)[::-1]
    else:
        bws, mws, ews = cut_thirds(words)
        bws = make_length_n(bws, 5)
        mws = make_length_n(mws, 7)
        ews = make_length_n(ews, 5)
    if bws is False or mws is False or ews is False:
        return ["Alas, it seems I can't make a haiku out of what you have written."]
    return bws + mws + ews
# the main entry point: turns a sentence into a 5-7-5 haiku if possible
def generate_haiku(sentence):
    if is_haiku(sentence):
        return format_haiku(sentence)
    # gatekeeping: only 13-21 syllables are close enough to 17 to fix up
    if syl_in_sentence(sentence) in range(13, 22):
        end_okay, middle_okay, beginning_okay = False, False, False
        sentence = remove_punctuation(sentence)
        words = sentence.split()
        beginning_okay = beginning_is_n_syllables(words, 5)
        end_okay = beginning_is_n_syllables(reversed(words), 5)
        if not (end_okay and beginning_okay):
            if beginning_okay == end_okay:
                # neither end breaks cleanly; a middle check could go here
                pass
            elif end_okay:
                middle_okay = beginning_is_n_syllables(reversed(words), 12)
            else:
                middle_okay = beginning_is_n_syllables(words, 12)
        words = modify_words(words, beginning_okay, middle_okay, end_okay)
        return format_haiku(words_to_sentence(words))
    elif syl_in_sentence(sentence) < 13:
        return format_haiku("The text you wrote me seems excessively short for making a haiku")
    else:
        return format_haiku("The text you wrote me seems excessively long for making a haiku")
# counts vowel letters, treating 'y' as a vowel when it follows a consonant
def nvowels(word):
    word = word.lower()
    ys = 0
    for i, l in enumerate(word):
        if l == 'y' and i != 0 and word[i - 1] not in "aeiou":
            ys += 1
    return ys + sum(map(word.count, "aeiou"))
# inserts a clap emoji between every syllable of the sentence
def clappify(sentence):
    clapped = ""
    for word in sentence.split(' '):
        if nsyl(word) == nvowels(word):
            # every vowel letter is a vowel sound, so we can clap after
            # each vowel letter (except the last, which ends the word)
            clapped_word = ""
            claps = 0
            for i, l in enumerate(word):
                clapped_word += l
                is_vowel = l in "aeiou"
                if not is_vowel and l == 'y':
                    # 'y' counts as a vowel when it follows a consonant
                    is_vowel = i != 0 and word[i - 1] not in "aeiou"
                if is_vowel and claps < nvowels(word) - 1:
                    clapped_word += "👏"
                    claps += 1
            clapped += clapped_word + "👏"
        else:
            # vowel letters and vowel sounds disagree; let pyphen split instead
            for syl in pyphen_dict.inserted(word).split('-'):
                clapped += syl + "👏"
        clapped += " "
    return clapped[:-1]
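# A small demo, callable from main() (illustrative only: the clappify
# output depends on cmudict/pyphen syllable counts, and generate_haiku on
# a non-haiku input would also depend on WordNet and the GloVe vectors):
def demo():
    basho = "An old silent pond a frog jumps into the pond splash silence again"
    print(generate_haiku(basho))
    print(clappify("clap along if you feel like a room without a roof"))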
def main():
    pass

if __name__ == '__main__':
    main()