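"""Build syllables.json from mhyph.txt (the Moby Project hyphenation list).

Counts how often each syllable shows up at each position in a word and
dumps the counts, plus a few totals, to a JSON file.
"""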

from collections import defaultdict
import json

MHYPH_FILENAME = 'mhyph.txt'
OUTPUT_FILENAME = 'syllables.json'


def def_value():
    # mhyph.txt has a word with 19 syllables in it, so keep a count slot
    # for each of the 19 possible positions
    return [0] * 19


if __name__ == '__main__':
    sd = defaultdict(def_value)
    with open(MHYPH_FILENAME, 'rb') as f:
        for line in f:
            # Syllable breaks in mhyph.txt are marked with the 0xa5 byte
            syllables = line.rstrip().split(b'\xa5')
            for i, s in enumerate(syllables):
                s = s.lower()
                # A byte outside 97-122 ('a'-'z') means the syllable has
                # punctuation or some other non-letter character in it,
                # so skip it. Probably not the slickest check, but it works.
                if any(char < 97 or char > 122 for char in s):
                    continue
                sd[s.decode('utf-8')][i] += 1

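    # At this point sd maps each syllable to a 19-element list of counts,
    # one per word position - e.g. sd['ing'][2] would be how many times
    # 'ing' appeared as the third syllable of a word.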
    dictionary = {}
    dictionary['syllables and their frequencies per position'] = sd

    # Not sure of the best term to use here... but these are all the
    # syllables stored according to the position they appear in the word,
    # repeated once for each time they appear there - useful (hopefully)
    # for making some vaguely real sounding words because it'll kind of
    # represent where these syllables are used in real words. Or it might
    # not! (There's a rough sketch of that idea at the bottom of this file.)
    # Note: [[]] * 19 would give 19 references to the *same* inner list,
    # so build 19 independent lists instead
    sbp = [[] for _ in range(19)]
    for s in sd:
        for j, count in enumerate(sd[s]):
            sbp[j].extend([s] * count)

    dictionary['positions and their syllables'] = sbp

    # I think I'll probably want some "meta" values like totals and
    # totals per position, or some stuff like that
    meta = {}
    totals = [0] * 19
    for syl in sd:
        for i, num in enumerate(sd[syl]):
            totals[i] += num

    meta['syllable totals'] = totals
    meta['total syllables'] = sum(totals)
    dictionary['__meta__'] = meta
    print(dictionary['__meta__']['syllable totals'])

    with open(OUTPUT_FILENAME, 'w') as f:
        json.dump(dictionary, f, indent=4)
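
    # Rough sketch of the "vaguely real sounding words" idea from the
    # comment above, as a quick sanity check: grab a random syllable from
    # each of the first few positions and stick them together. Three
    # syllables is just an arbitrary choice.
    import random
    made_up = ''.join(random.choice(sbp[i]) for i in range(3) if sbp[i])
    print(made_up)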