/** 
  * Character entity substituter for the text/HTML word counter.
  * @author Robert J Morton <rob@robmorton.20m.com>
  * @version 04 September 2000
  * @copyright Sep 2000 Robert J Morton (all rights reserved) */

/* Converts a presented HTML character entity string into the corresponding
   Unicode character code. */

/* Translates HTML character entities, written either by name (ampersand,
   name, semi-colon) or by decimal number (ampersand, hash, digits,
   semi-colon), into the single character they stand for.

   Characters are fed one at a time to capture(). All capture state is held
   in static fields, so only one character stream can be processed at a
   time. */

class charEnt {

  /* Entity name -> character it represents. Filled once by the static
     initialiser below. A hash map is used so that a lookup depends neither
     on the order in which the names are listed nor on a second table having
     to be kept in step with the first. */
  private static final java.util.Map<String, Character> ENTITIES =
    new java.util.HashMap<String, Character>(256);

  /* Registers one named entity together with its Unicode code. */
  private static void def(String name, int code) {
    ENTITIES.put(name, Character.valueOf((char)code));
  }

  /* The recognised entity names (case-sensitive, as in HTML) and the
     Unicode code each one stands for. */
  static {
    def("aacute", 225);  def("Aacute", 193);  def("acirc", 226);
    def("Acirc", 194);   def("acute", 180);   def("aelig", 230);
    def("AElig", 198);   def("agrave", 224);  def("Agrave", 192);
    def("alpha", 945);   def("amp", 38);      def("approx", 8776);
    def("aring", 229);   def("Aring", 197);   def("asymp", 8776);
    def("atilde", 227);  def("Atilde", 195);  def("auml", 228);
    def("Auml", 196);    def("brvbar", 166);  def("ccedil", 231);
    def("Ccedil", 199);  def("cedil", 184);   def("cent", 162);
    def("cong", 8773);   def("copy", 169);    def("curren", 164);
    def("dagger", 8224); def("Dagger", 8225); def("deg", 176);
    def("delta", 948);   def("div", 247);     def("divide", 247);
    def("eacute", 233);  def("Eacute", 201);  def("ecirc", 234);
    def("Ecirc", 202);   def("egrave", 232);  def("Egrave", 200);
    def("epsilon", 949); def("equiv", 8801);  def("eth", 240);
    def("ETH", 208);     def("euml", 235);    def("Euml", 203);
    def("frac12", 189);  def("frac13", 8531); def("frac14", 188);
    def("frac23", 8532); def("frac34", 190);  def("ge", 8805);
    def("gt", 62);       def("iacute", 237);  def("Iacute", 205);
    def("icirc", 238);   def("Icirc", 206);   def("iexcl", 161);
    def("igrave", 236);  def("Igrave", 204);  def("int", 8747);
    def("iquest", 191);  def("iuml", 239);    def("Iuml", 207);
    def("laquo", 171);   def("lt", 60);       def("macr", 175);
    def("micro", 181);   def("middot", 183);  def("minus", 8722);
    def("mu", 956);      def("nbsp", 160);    def("ne", 8800);
    def("not", 172);     def("ntilde", 241);  def("Ntilde", 209);
    def("oacute", 243);  def("Oacute", 211);  def("ocirc", 244);
    def("Ocirc", 212);   def("ograve", 242);  def("Ograve", 210);
    def("omega", 969);   def("ordf", 170);    def("ordm", 186);
    def("oslash", 248);  def("Oslash", 216);  def("otilde", 245);
    def("Otilde", 213);  def("ouml", 246);    def("Ouml", 214);
    def("para", 182);    def("part", 8706);   def("pi", 960);
    def("plusmn", 177);  def("pound", 163);   def("quot", 34);
    def("radic", 8730);  def("raquo", 187);   def("reg", 174);
    def("rho", 961);     def("sect", 167);    def("shy", 173);
    def("sup1", 185);    def("sup2", 178);    def("sup3", 179);
    def("szlig", 223);   def("theta", 952);   def("thorn", 254);
    def("THORN", 222);   def("times", 215);   def("uacute", 250);
    def("Uacute", 218);  def("ucirc", 251);   def("Ucirc", 219);
    def("ugrave", 249);  def("Ugrave", 217);  def("uml", 168);
    def("uuml", 252);    def("Uuml", 220);    def("yacute", 253);
    def("Yacute", 221);  def("yen", 165);     def("yuml", 255);
  }

  /* Character entity error flag. Cleared at the start of every call to
     capture() and set only when an entity is abandoned for being too
     long. */
  static boolean CEE = false;




  /* LOOK UP A NAMED CHARACTER ENTITY
  's' is the entity name without its leading ampersand or its terminating
  semi-colon. Returns the character the name stands for or, if the name is
  not recognised, a '^' in the hope that it will be sufficiently out of
  context to indicate an invalid character entity.
  Called only from one place in capture(). */

  private static char getChar(String s) {
    Character ch = ENTITIES.get(s);
    return ch != null ? ch.charValue() : '^';
  }




  static int
    a = 0,     // capture phase: 0 = not in an entity, 1 = just seen the
               // ampersand, 2 = gathering the entity's characters
    Ymax = 7;  // max number of chars allowed in the capture string

  // string in which to accumulate the characters of an entity
  static String Y = "";




  /* CAPTURE A CHARACTER ENTITY
  Accepts the next character 'c' of the input and returns:
    - 'c' itself if it is an ordinary character outside any entity;
    - a null character (0) while an entity is still being gathered;
    - the decoded character when a terminating semi-colon arrives, or '^'
      if the gathered text is neither a number nor a recognised name;
    - '^', with the CEE flag set, if the entity grows too long.
  Called from only one place in wordCnt.count(). */

  static char capture(char c) {
    CEE = false;  // clear the character entity error flag

    /* Phase 0: not inside an entity. Anything but an ampersand is an
    ordinary character and is handed straight back. An ampersand starts a
    capture with an empty capture string. */

    if(a == 0) {
      if(c != '&')
        return c;
      a = 1;
      Y = "";
      return 0;
    }

    /* Phase 1: the previous character was the ampersand. A hash announces
    a numeric entity, for which at most 4 further characters are gathered.
    Anything else is the first character of a name; it is stored at once
    and the name may grow to 7 characters in all. */

    if(a == 1) {
      if(c == '#')
        Ymax = 4;
      else {
        Ymax = 7;
        Y += c;
      }
      a = 2;
      return 0;
    }

    /* Phase 2, terminating semi-colon: the entity is complete. Try the
    gathered text as a decimal number first; if it will not parse as one
    it must be a name, so look that up instead. */

    if(c == ';') {
      a = 0;  // back to non-capture phase whatever the outcome
      try {
        return (char)Integer.parseInt(Y);
      }
      catch(NumberFormatException e) {
        return getChar(Y);
      }
    }

    /* Phase 2, still gathering: provided the entity is not yet too long,
    add this character to the capture string. */

    if(Y.length() < Ymax) {
      Y += c;
      return 0;
    }

    /* Too many characters: abandon the entity, return to normal character
    mode and raise the character entity error flag. */

    a = 0;
    CEE = true;
    return '^';
  }




  /* Lets the wordCnt class read the state of the character entity error
  flag as left by the most recent call to capture().
  Called from only one place in wordCnt.count(). */

  static boolean getCEE() { return CEE; }
}