#!/usr/bin/env python3
"""Generate an HTML kanji list, grouped by school grade, from Unihan data.

The script reads ``Unihan/Unihan_DictionaryLikeData.txt`` and
``Unihan/Unihan_Readings.txt``, and writes to standard output the contents
of ``top.txt``, one section per ``kGradeLevel`` value, and finally the
contents of ``bottom.txt``.

NOTE(review): the HTML tags inside this script's string literals were lost
before review (``escape`` replaced "<" with "<", and the row template had
fewer placeholders than arguments).  The entity names in ``escape`` and the
tags in ``_ROW_TEMPLATE`` / the grade heading are reconstructions; confirm
them against the markup that ``top.txt`` and ``bottom.txt`` expect.
"""

import html
import string
import sys


def parse_Unihan(input_file, dictionary):
    """Add the records of one Unihan data file to *dictionary*.

    Each non-blank, non-comment line is expected to hold three tab-separated
    fields: ``U+XXXX``, a property name and a value.

    Args:
        input_file: iterable of text lines (an open file works).
        dictionary: dict mapping an integer code point to a list of
            ``(key, value)`` tuples; it is updated in place.

    Raises:
        ValueError: if a data line has fewer than three fields or does not
            start with ``U+``.
    """
    for raw_line in input_file:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        if len(fields) < 3 or not fields[0].startswith("U+"):
            raise ValueError("malformed Unihan line: %r" % (line,))
        code = int(fields[0][2:], 16)
        dictionary.setdefault(code, []).append((fields[1], fields[2]))


def get_value(entry, key):
    """Return the first value stored under *key* in *entry*, or None.

    *entry* is a sequence of ``(key, value)`` tuples as built by
    ``parse_Unihan``.
    """
    return next((value for x_key, value in entry if x_key == key), None)


def escape(text):
    """Return *text* with ``&``, ``<`` and ``>`` escaped for HTML.

    None and the empty string both yield ``""``.
    """
    if not text:
        return ""
    return html.escape(text, quote=False)


vowels = ["A", "I", "U", "E", "O"]

# Hiragana by consonant row; the columns follow ``vowels``.  None marks a
# syllable that the row does not have.
unromaji_table = {
    # TODO JI => ZI; or DI; 3063 small tsu. TSU. small YA=3083, SMALL YU=3085, SMALL YO=3087
    # TODO "N"=3093; VU=3094; TI=CHI, DI=JI
    # TODO difference HU/FU
    None: ["\u3042", "\u3044", "\u3046", "\u3048", "\u304A"],
    "W": ["\u308F", None, None, None, "\u3092"],
    "R": ["\u3089", "\u308A", "\u308B", "\u308C", "\u308D"],
    "Y": ["\u3084", None, "\u3086", None, "\u3088"],
    "M": ["\u307E", "\u307F", "\u3080", "\u3081", "\u3082"],
    "H": ["\u306F", "\u3072", "\u3075", "\u3078", "\u307B"],
    "F": [None, None, "\u3075", None, None],
    "N": ["\u306A", "\u306B", "\u306C", "\u306D", "\u306E"],
    "T": ["\u305F", "\u3061", "\u3064", "\u3066", "\u3068"],
    "S": ["\u3055", "\u3057", "\u3059", "\u305B", "\u305D"],
    "K": ["\u304B", "\u304D", "\u304F", "\u3051", "\u3053"],
    # BA is U+3070; the table used to hold U+306F (HA) here.
    "B": ["\u3070", "\u3073", "\u3076", "\u3079", "\u307C"],
    "D": ["\u3060", "\u3062", "\u3065", "\u3067", "\u3069"],
    "Z": ["\u3056", "\u3058", "\u305A", "\u305C", "\u305E"],
    "G": ["\u304C", "\u304E", "\u3050", "\u3052", "\u3054"],
    "P": ["\u3071", "\u3074", "\u3077", "\u307A", "\u307D"],
    # Small YA is U+3083; the table used to hold U+3084 (large YA) here.
    "y": ["\u3083", None, "\u3085", None, "\u3087"],
    "tiny": ["\u3041", "\u3043", "\u3045", "\u3047", "\u3049"],
}

# KYA = KI & YA
# NYUUMU

_SMALL_TSU = 0x3063   # written before a doubled consonant
_SYLLABIC_N = 0x3093  # "N" that is not followed by a vowel or Y
_ROW_ALIASES = {"J": "Z", "C": "T"}

# States of the converter in _word_to_kana.
_START = 0      # expecting a vowel or the consonant that opens a syllable
_CONSONANT = 1  # a consonant is pending
_GLIDE = 2      # the I-column kana was written after H/Y; expecting the vowel


def _entity(code_point):
    """Format *code_point* as an HTML hexadecimal character reference."""
    return "&#x%x;" % code_point


def _consonant_row(letter):
    """Return the kana row for *letter*; J and C borrow the Z and T rows."""
    return unromaji_table[_ROW_ALIASES.get(letter, letter)]


def _syllable(consonant, row, vowel):
    """Return the output for pending *consonant* (with its *row*) plus *vowel*."""
    index = vowels.index(vowel)
    if consonant == "J" and vowel in ("A", "U", "O"):
        # JA/JU/JO are JI followed by a small YA/YU/YO, not ZA/ZU/ZO.
        small = unromaji_table["y"][index]
        return _entity(ord(row[vowels.index("I")])) + _entity(ord(small))
    kana = row[index]
    if kana is None and consonant == "F":  # foreign
        return "F" + vowel  # TODO katakana?
    return _entity(ord(kana))


def _word_to_kana(word):
    """Convert one space-free romaji word to hiragana character references."""
    pieces = []
    state = _START
    pending = None   # romaji letter of the consonant waiting for its vowel
    row = None       # kana row selected by ``pending``
    previous = None  # previous input character
    for c in word:
        if state == _START:
            if c in vowels:
                pieces.append(_entity(ord(unromaji_table[None][vowels.index(c)])))
            else:
                pending, row, state = c, _consonant_row(c), _CONSONANT
        elif state == _CONSONANT:
            if c == previous:
                # Doubled consonant (PPA): small TSU, except that NN is the
                # syllabic N followed by an N-row syllable.
                pieces.append(_entity(_SYLLABIC_N if c == "N" else _SMALL_TSU))
                pending, row = c, _consonant_row(c)
            elif c == "S" and pending == "T":
                pass  # "TS": the T row already holds TSU
            elif c == "Y" or (c == "H" and pending != "N"):
                # SH/CH/KY...: write the I-column kana, then a small Y kana.
                pieces.append(_entity(ord(row[vowels.index("I")])))
                state = _GLIDE
            elif c in vowels:
                pieces.append(_syllable(pending, row, c))
                state = _START
            elif pending == "N":
                # N before another consonant is the syllabic N.
                pieces.append(_entity(_SYLLABIC_N))
                pending, row = c, _consonant_row(c)
            else:
                raise ValueError(
                    "unexpected %r after %r in %r" % (c, pending, word))
        else:  # _GLIDE
            if c != "I":  # SHI/CHI need no small kana
                pieces.append(_entity(ord(unromaji_table["y"][vowels.index(c)])))
            state = _START
        previous = c
    if state == _CONSONANT:
        # A trailing N is the syllabic N; any other dangling consonant is
        # passed through as a reference to the Latin letter itself.
        pieces.append(_entity(_SYLLABIC_N if pending == "N" else ord(pending)))
    return "".join(pieces)


def unromaji_2(text):
    """Convert upper-case romaji to hiragana HTML character references.

    Words are separated by single spaces, which are kept in the output; each
    word is converted on its own.

    Args:
        text: romaji made of ASCII letters and spaces, or None.

    Returns:
        The converted string, or None if *text* is None.

    Raises:
        ValueError: on a character other than an ASCII letter or space, or
            on a letter sequence the converter cannot interpret.
        KeyError: on a consonant that has no row in ``unromaji_table``.
        TypeError: on a syllable whose table entry is None.
    """
    if text is None:
        return text
    allowed = string.ascii_letters + " "
    for c in text:
        if c not in allowed:
            raise ValueError("unsupported character %r in %r" % (c, text))
    return " ".join(_word_to_kana(word) for word in text.split(" "))


def unspace(text):
    """Return *text* with every space replaced by ``|``; None stays None."""
    if text is None:
        return text
    return text.replace(" ", "|")


def unromaji(text):
    """Best-effort ``unromaji_2``: on failure report it and return *text*."""
    try:
        return unromaji_2(text)
    except Exception:
        # Deliberately broad: any conversion failure falls back to the
        # unconverted reading instead of aborting the whole run.
        print("broken:", text, file=sys.stderr)
        return text


# NOTE(review): reconstructed markup, see the module docstring.  Arguments:
# code point (twice), definition, kun reading, kun reading in kana, on
# reading.  The trailing %%s leaves one placeholder for the caller.
_ROW_TEMPLATE = '\n<p id="u%x">&#x%x; %s %s %s %s%%s</p>'


def _cell(text):
    """Escape *text* for HTML and protect ``%`` from the caller's later ``%``."""
    return escape(text).replace("%", "%%")


def get_Japanese_entries(dictionary):
    """Yield ``(entry, html)`` for every kanji that has a Japanese reading.

    Code points U+4E00..U+9FAF are scanned.  *html* is a format string with
    exactly one ``%s`` left for the caller to fill in.  Code points that are
    missing from *dictionary* are skipped.
    """
    for code in range(0x4E00, 0x9FAF + 1):
        if 0x9FA6 <= code <= 0x9FAF:  # FIXME ???
            continue
        dictionary_entry = dictionary.get(code)
        if dictionary_entry is None:
            continue
        kun = get_value(dictionary_entry, "kJapaneseKun")
        on = get_value(dictionary_entry, "kJapaneseOn")
        definition = get_value(dictionary_entry, "kDefinition")
        if not (kun or on):
            continue
        kana = unromaji(kun)
        if kana == kun:
            # Unchanged output is still raw input text (conversion failed or
            # had nothing to convert), so it must be escaped like the rest.
            kana = escape(kana)
        kana = (unspace(kana) or "").replace("%", "%%")
        yield dictionary_entry, _ROW_TEMPLATE % (
            code, code, _cell(definition), _cell(unspace(kun)), kana,
            _cell(unspace(on)))


def group_by_Grade(entries):
    """Group ``(entry, html)`` pairs by ``kGradeLevel``.

    Entries without a grade are filed under 99.

    Returns:
        A list of ``(grade, pairs)`` tuples sorted by grade; the pairs keep
        their input order.
    """
    groups = {}
    for dictionary_entry, HTML in entries:
        grade = int(get_value(dictionary_entry, "kGradeLevel") or 99)
        groups.setdefault(grade, []).append((dictionary_entry, HTML))
    return sorted(groups.items())


def main():
    """Read the Unihan files and write the HTML document to standard output."""
    dictionary = {}
    for path in ("Unihan/Unihan_DictionaryLikeData.txt",
                 "Unihan/Unihan_Readings.txt"):
        with open(path, "r", encoding="utf-8") as unihan_file:
            parse_Unihan(unihan_file, dictionary)

    # Write bytes so that top.txt and bottom.txt are copied through verbatim.
    out = sys.stdout.buffer

    def emit(line):
        out.write((line + "\n").encode("utf-8"))

    with open("top.txt", "rb") as top:
        out.write(top.read())
    for grade, entries in group_by_Grade(get_Japanese_entries(dictionary)):
        emit("")
        # NOTE(review): heading tag reconstructed, see the module docstring.
        emit("<h2>Grade %s</h2>" % grade)
        emit("")
        for dictionary_entry, HTML in entries:
            TotalStrokes = get_value(dictionary_entry, "kTotalStrokes")
            # Leave the stroke count out when the data does not provide one.
            extra = ",%s strokes" % TotalStrokes if TotalStrokes else ""
            emit(HTML % extra)
    with open("bottom.txt", "rb") as bottom:
        out.write(bottom.read())
    # FIXME order by frequency desc.


if __name__ == "__main__":
    main()