#!/usr/bin/env python
# Splits a byte stream into words, skipping boundary characters and anything
# that looks like binary or base64 junk.  Written for Python 2: the reader is
# expected to return byte strings.
import string

import stopwords

# TODO: utf-8 complex boundaries like German/French quotes, Japanese quotes...

# vowels = set(["a", "i", "u", "e", "o", "y"])
# ^-- is that in any language?

# Characters that end a word.
# @%?&+ are HTML specialities.
# ' is questionable!
# - is questionable.
# \ is questionable (win32).
# ` to catch braindeadness.
# ^ to catch some stuff (not sure what).
# 0xFF to catch more braindeadness.
boundary = set(["<", ">", "(", ")", "{", "}", ",", ".", ";", ":", " ", "!",
                "/", "_", "*", "|", "\n", "\r", "\t", "\"", "=", "#", "?",
                "@", "%", "'", "-", "\\", "&", "`", "[", "]", "+", "^",
                chr(0xFF)])

# Digits are only allowed inside a few known acronyms.
semi_valid_characters = "0123456789"
semi_acronyms = ["pop3", "imap4", "p2p", "mp3", "c64", "c128", "vic20"]
# TODO: plan9

max_word_bytes = 32  # longer tokens are treated as junk (e.g. base64 blocks)
min_word_bytes = 3   # shorter tokens are dropped


def _is_printable_word(word):
    """Return True if word starts with an ASCII letter, contains no control
    characters and decodes as UTF-8."""
    # TODO: support Japanese/Chinese.
    if word[0] not in string.ascii_letters:
        return False
    for c in word:
        if ord(c) < 32:
            return False
    try:
        word.decode("utf-8")
    except UnicodeDecodeError:
        return False
    return True


def mark_words(reader, extra_ok=""):
    """Yield (is_word, token, offset) tuples read from reader.

    is_word is True for extracted, lowercased words and False for boundary
    markers ("\\n", "*" or "...").  offset is the byte offset of the character
    that terminated the token.  Characters in extra_ok are not treated as
    boundaries even if they appear in the boundary set.
    """
    offset = 0
    semi_word = []          # raw characters of the current token, digits included
    word = ""               # the current token with digits stripped out
    is_real_word = False    # first character was a letter, not a digit
    is_semi_valid = False   # token contains a digit; only kept if it is a known acronym
    signaled_boundary = False
    in_header = 0
    while True:
        c = reader.read(1)
        if c == "":  # EOF
            break
        if in_header < 10:
            in_header += 1
            if ord(c) < 32 and ord(c) != 10 and ord(c) != 13:
                # Control character near the start: probably binary, so don't
                # bother even doing anything.
                return
        if c in boundary and c not in extra_ok:
            if word != "" and is_real_word and len(word) < max_word_bytes:
                # Words that are too long are just ignored in order to skip
                # those darn base64 encoded blocks.
                if is_semi_valid:
                    # Keep the digits only if the whole token is a known
                    # acronym (compared case-insensitively, e.g. "POP3").
                    if "".join(semi_word).lower() in semi_acronyms:
                        word = "".join(semi_word)
                word = word.lower()
                while word.endswith("-") or word.endswith("_"):
                    word = word[:-1]
                while word.startswith("-") or word.startswith("_"):
                    word = word[1:]
                if word != "" and len(word) >= min_word_bytes:
                    if _is_printable_word(word):
                        yield True, word, offset
            word = ""
            semi_word = []
            is_real_word = False
            is_semi_valid = False
            if c == "\n":
                yield False, "\n", offset
            elif c == " " or c == "\t":
                pass  # ignore, too many of them...
            else:
                if not signaled_boundary:
                    signaled_boundary = True
                    if c == "*":
                        yield False, "*", offset
                    else:
                        yield False, "...", offset
        else:
            # Part of a word, probably.
            signaled_boundary = False
            if len(word) < max_word_bytes:
                semi_word.append(c)
                if c not in semi_valid_characters:
                    if not is_semi_valid:
                        word = word + c
                else:
                    is_semi_valid = True
                if (c not in string.digits) and len(word) == 1:
                    is_real_word = True
            elif len(word) == max_word_bytes:
                word = word + "..."  # mark as cut off; it will be discarded
        offset += 1


def find_words(reader, extra_ok=""):
    """Yield (word, offset) pairs, dropping the boundary markers."""
    return filter_unmarked(mark_words(reader, extra_ok))


def filter_unmarked(generator):
    """Pass through only the tuples that mark_words() flagged as real words."""
    for mark, word, offset in generator:
        if mark:
            yield word, offset


def filter_uncommon_words(marked_generator):
    """Drop stop words and anything starting with a digit."""
    for word, offset in marked_generator:
        if word not in stopwords.stopwords and len(word) > 0 and word[0] not in string.digits:
            yield word, offset
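

# ---------------------------------------------------------------------------
# Minimal usage sketch (assumes a Python 2 interpreter and that
# stopwords.stopwords is a container of lowercase stop words): feeds a small
# in-memory sample through the tokenizer and prints the surviving words.
if __name__ == "__main__":
    from StringIO import StringIO

    sample = StringIO("Fetch mail via pop3 from mail.example.com today!\n")
    # find_words() yields (word, offset) pairs for every token that passes the
    # boundary, length and printability checks; filter_uncommon_words() then
    # drops stop words and tokens starting with a digit.
    for word, offset in filter_uncommon_words(find_words(sample)):
        print "%5d  %s" % (offset, word)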