#!/usr/bin/env python
# Splits a byte stream into words, skipping boundary characters and anything
# that looks like binary or base64 junk.  Written for Python 2: the reader is
# expected to return byte strings.
import string

import stopwords

# TODO: utf-8 complex boundaries like German/French quotes, Japanese quotes...

# vowels = set(["a", "i", "u", "e", "o", "y"])
# ^-- is that in any language?

# Characters that end a word.
# @%?&+ are HTML specialities.
# ' is questionable!
# - is questionable.
# \ is questionable (win32).
# ` to catch braindeadness.
# ^ to catch some stuff (not sure what).
# 0xFF to catch more braindeadness.
boundary = set(["<", ">", "(", ")", "{", "}", ",", ".", ";", ":", " ", "!",
                "/", "_", "*", "|", "\n", "\r", "\t", "\"", "=", "#", "?",
                "@", "%", "'", "-", "\\", "&", "`", "[", "]", "+", "^",
                chr(0xFF)])

# Digits are only allowed inside a few known acronyms.
semi_valid_characters = "0123456789"
semi_acronyms = ["pop3", "imap4", "p2p", "mp3", "c64", "c128", "vic20"]
# TODO: plan9

max_word_bytes = 32  # longer tokens are treated as junk (e.g. base64 blocks)
min_word_bytes = 3   # shorter tokens are dropped


def _is_printable_word(word):
    """Return True if word starts with an ASCII letter, contains no control
    characters and decodes as UTF-8."""
    # TODO: support Japanese/Chinese.
    if word[0] not in string.ascii_letters:
        return False
    for c in word:
        if ord(c) < 32:
            return False
    try:
        word.decode("utf-8")
    except UnicodeDecodeError:
        return False
    return True


def mark_words(reader, extra_ok=""):
    """Yield (is_word, token, offset) tuples read from reader.

    is_word is True for extracted, lowercased words and False for boundary
    markers ("\\n", "*" or "...").  offset is the byte offset of the character
    that terminated the token.  Characters in extra_ok are not treated as
    boundaries even if they appear in the boundary set.
    """
    offset = 0
    semi_word = []          # raw characters of the current token, digits included
    word = ""               # the current token with digits stripped out
    is_real_word = False    # first character was a letter, not a digit
    is_semi_valid = False   # token contains a digit; only kept if it is a known acronym
    signaled_boundary = False
    in_header = 0
    while True:
        c = reader.read(1)
        if c == "":  # EOF
            break
        if in_header < 10:
            in_header += 1
            if ord(c) < 32 and ord(c) != 10 and ord(c) != 13:
                # Control character near the start: probably binary, so don't
                # bother even doing anything.
                return
        if c in boundary and c not in extra_ok:
            if word != "" and is_real_word and len(word) < max_word_bytes:
                # Words that are too long are just ignored in order to skip
                # those darn base64 encoded blocks.
                if is_semi_valid:
                    # Keep the digits only if the whole token is a known
                    # acronym (compared case-insensitively, e.g. "POP3").
                    if "".join(semi_word).lower() in semi_acronyms:
                        word = "".join(semi_word)
                word = word.lower()
                while word.endswith("-") or word.endswith("_"):
                    word = word[:-1]
                while word.startswith("-") or word.startswith("_"):
                    word = word[1:]
                if word != "" and len(word) >= min_word_bytes:
                    if _is_printable_word(word):
                        yield True, word, offset
            word = ""
            semi_word = []
            is_real_word = False
            is_semi_valid = False
            if c == "\n":
                yield False, "\n", offset
            elif c == " " or c == "\t":
                pass  # ignore, too many of them...
            else:
                if not signaled_boundary:
                    signaled_boundary = True
                    if c == "*":
                        yield False, "*", offset
                    else:
                        yield False, "...", offset
        else:
            # Part of a word, probably.
            signaled_boundary = False
            if len(word) < max_word_bytes:
                semi_word.append(c)
                if c not in semi_valid_characters:
                    if not is_semi_valid:
                        word = word + c
                else:
                    is_semi_valid = True
                if (c not in string.digits) and len(word) == 1:
                    is_real_word = True
            elif len(word) == max_word_bytes:
                word = word + "..."  # mark as cut off; it will be discarded
        offset += 1


def find_words(reader, extra_ok=""):
    """Yield (word, offset) pairs, dropping the boundary markers."""
    return filter_unmarked(mark_words(reader, extra_ok))


def filter_unmarked(generator):
    """Pass through only the tuples that mark_words() flagged as real words."""
    for mark, word, offset in generator:
        if mark:
            yield word, offset


def filter_uncommon_words(marked_generator):
    """Drop stop words and anything starting with a digit."""
    for word, offset in marked_generator:
        if word not in stopwords.stopwords and len(word) > 0 and word[0] not in string.digits:
            yield word, offset
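

# ---------------------------------------------------------------------------
# Minimal usage sketch (assumes a Python 2 interpreter and that
# stopwords.stopwords is a container of lowercase stop words): feeds a small
# in-memory sample through the tokenizer and prints the surviving words.
if __name__ == "__main__":
    from StringIO import StringIO

    sample = StringIO("Fetch mail via pop3 from mail.example.com today!\n")
    # find_words() yields (word, offset) pairs for every token that passes the
    # boundary, length and printability checks; filter_uncommon_words() then
    # drops stop words and tokens starting with a digit.
    for word, offset in filter_uncommon_words(find_words(sample)):
        print "%5d  %s" % (offset, word)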