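"""Build syllables.json from mhyph.txt (the Moby Project hyphenation list).

Counts how often each syllable shows up at each position in a word and
dumps the counts, plus a few totals, to a JSON file.
"""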

from collections import defaultdict
import json

MHYPH_FILENAME = 'mhyph.txt'
OUTPUT_FILENAME = 'syllables.json'


def def_value():
    # mhyph.txt has a word with 19 syllables in it, so keep a count slot
    # for each of the 19 possible positions
    return [0] * 19


if __name__ == '__main__':
    sd = defaultdict(def_value)
    with open(MHYPH_FILENAME, 'rb') as f:
        for line in f:
            # Syllable breaks in mhyph.txt are marked with the 0xa5 byte
            syllables = line.rstrip().split(b'\xa5')
            for i, s in enumerate(syllables):
                s = s.lower()
                # A byte outside 97-122 ('a'-'z') means the syllable has
                # punctuation or some other non-letter character in it,
                # so skip it. Probably not the slickest check, but it works.
                if any(char < 97 or char > 122 for char in s):
                    continue
                sd[s.decode('utf-8')][i] += 1

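    # At this point sd maps each syllable to a 19-element list of counts,
    # one per word position - e.g. sd['ing'][2] would be how many times
    # 'ing' appeared as the third syllable of a word.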
    dictionary = {}
    dictionary['syllables and their frequencies per position'] = sd

    # Not sure of the best term to use here... but these are all the
    # syllables stored according to the position they appear in the word,
    # repeated once for each time they appear there - useful (hopefully)
    # for making some vaguely real sounding words because it'll kind of
    # represent where these syllables are used in real words. Or it might
    # not! (There's a rough sketch of that idea at the bottom of this file.)
    # Note: [[]] * 19 would give 19 references to the *same* inner list,
    # so build 19 independent lists instead
    sbp = [[] for _ in range(19)]
    for s in sd:
        for j, count in enumerate(sd[s]):
            sbp[j].extend([s] * count)

    dictionary['positions and their syllables'] = sbp

    # I think I'll probably want some "meta" values like totals and
    # totals per position, or some stuff like that
    meta = {}
    totals = [0] * 19
    for syl in sd:
        for i, num in enumerate(sd[syl]):
            totals[i] += num

    meta['syllable totals'] = totals
    meta['total syllables'] = sum(totals)
    dictionary['__meta__'] = meta
    print(dictionary['__meta__']['syllable totals'])

    with open(OUTPUT_FILENAME, 'w') as f:
        json.dump(dictionary, f, indent=4)
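
    # Rough sketch of the "vaguely real sounding words" idea from the
    # comment above, as a quick sanity check: grab a random syllable from
    # each of the first few positions and stick them together. Three
    # syllables is just an arbitrary choice.
    import random
    made_up = ''.join(random.choice(sbp[i]) for i in range(3) if sbp[i])
    print(made_up)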