from collections import defaultdict
import json

MHYPH_FILENAME = 'mhyph.txt'
OUTPUT_FILENAME = 'syllables.json'


def def_value():
    # The longest word in mhyph.txt has 19 syllables, so every syllable
    # gets a 19-slot vector of per-position counts.
    return [0] * 19


if __name__ == '__main__':
    sd = defaultdict(def_value)
    with open(MHYPH_FILENAME, 'rb') as f:
        for line in f:
            # mhyph.txt separates syllables with the 0xA5 byte (the ¥ character).
            syllables = line.rstrip().split(b'\xa5')
            for i, s in enumerate(syllables):
                s = s.lower()
                # Skip entries containing anything outside a-z (bytes 97-122),
                # e.g. hyphens, apostrophes, or accented characters.
                if any(char < 97 or char > 122 for char in s):
                    continue
                sd[s.decode('utf-8')][i] += 1

    dictionary = {}
    dictionary['syllables and their frequencies per position'] = sd

    # Unsure quite the best term to use here... but this is every syllable
    # stored under the position it appears at in a word, repeated once per
    # occurrence - useful (hopefully) for making some vaguely real-sounding
    # words, because it roughly represents where these syllables sit in real
    # words. Or it might not!
    # Note: build the outer list with a comprehension rather than [[]] * 19,
    # which would create 19 references to the same inner list.
    sbp = [[] for _ in range(19)]
    for s in sd:
        for j, count in enumerate(sd[s]):
            sbp[j].extend([s] * count)
    dictionary['positions and their syllables'] = sbp

    # Some "meta" values: per-position totals and the overall total.
    meta = {}
    totals = [0] * 19
    for syl in sd:
        for i, num in enumerate(sd[syl]):
            totals[i] += num
    meta['syllable totals'] = totals
    meta['total syllables'] = sum(totals)
    dictionary['__meta__'] = meta

    print(dictionary['__meta__']['syllable totals'])

    with open(OUTPUT_FILENAME, 'w') as f:
        json.dump(dictionary, f, indent=4)
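

# --- Illustrative sketch, not part of the pipeline above ---
# One possible way to consume syllables.json: pick a syllable for each position,
# weighted by how often real words use that syllable there. The function name,
# the default word length, and the use of random.choice are assumptions added
# here for illustration, not something the script above requires.
def generate_word(num_syllables=3, filename=OUTPUT_FILENAME):
    import random
    with open(filename) as f:
        data = json.load(f)
    positions = data['positions and their syllables']
    parts = []
    for i in range(min(num_syllables, len(positions))):
        # positions[i] lists each syllable once per occurrence at position i,
        # so a uniform random.choice is effectively a frequency-weighted draw.
        if positions[i]:
            parts.append(random.choice(positions[i]))
    return ''.join(parts)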