/**
 * Character entity substituter for the text/HTML word counter.
 *
 * <p>Fed one character at a time through {@link #capture(char)}, it passes
 * ordinary text straight through and replaces each HTML character entity
 * ({@code &name;} or {@code &#nnn;}) with the single Unicode character that
 * the entity denotes.
 *
 * <p>All state is held in static fields, so only one character stream can be
 * processed at a time.
 *
 * @author Robert J Morton
 * @version 04 September 2000
 * @copyright Sep 2000 Robert J Morton (all rights reserved)
 */
class charEnt {

    /** Character returned in place of an unknown, invalid or over-long entity. */
    private static final char BAD_ENTITY = '^';

    /** Returned while an entity is still being gathered: nothing to emit yet. */
    private static final char NOTHING = 0;

    /**
     * Entity name to character code. Name and code are stored together as one
     * map entry so that the two can never drift out of step, and look-up is
     * case-sensitive (HTML distinguishes {@code &eacute;} from {@code &Eacute;}).
     */
    private static final java.util.Map<String, Character> ENTITIES = buildTable();

    /** Character entity error flag: true only when the last call to capture() overflowed. */
    static boolean CEE = false;

    static int a = 0,   // capture phase: 0 = plain text, 1 = just seen '&', 2 = gathering
               Ymax = 7; // maximum number of characters allowed in the entity being gathered

    /** String in which the characters of the current entity are accumulated. */
    static String Y = "";

    /** Registers one entity name with its character code. */
    private static void def(java.util.Map<String, Character> table, String name, int code) {
        table.put(name, Character.valueOf((char) code));
    }

    /** Builds the table of recognised entity names. */
    private static java.util.Map<String, Character> buildTable() {
        java.util.Map<String, Character> t = new java.util.HashMap<String, Character>(256);

        // Markup-significant characters
        def(t, "quot", 34);    def(t, "amp", 38);     def(t, "lt", 60);      def(t, "gt", 62);

        // Latin-1 symbols
        def(t, "nbsp", 160);   def(t, "iexcl", 161);  def(t, "cent", 162);   def(t, "pound", 163);
        def(t, "curren", 164); def(t, "yen", 165);    def(t, "brvbar", 166); def(t, "sect", 167);
        def(t, "uml", 168);    def(t, "copy", 169);   def(t, "ordf", 170);   def(t, "laquo", 171);
        def(t, "not", 172);    def(t, "shy", 173);    def(t, "reg", 174);    def(t, "macr", 175);
        def(t, "deg", 176);    def(t, "plusmn", 177); def(t, "sup2", 178);   def(t, "sup3", 179);
        def(t, "acute", 180);  def(t, "micro", 181);  def(t, "para", 182);   def(t, "middot", 183);
        def(t, "cedil", 184);  def(t, "sup1", 185);   def(t, "ordm", 186);   def(t, "raquo", 187);
        def(t, "frac14", 188); def(t, "frac12", 189); def(t, "frac34", 190); def(t, "iquest", 191);
        def(t, "times", 215);  def(t, "divide", 247); def(t, "div", 247);

        // Latin-1 upper-case letters
        def(t, "Agrave", 192); def(t, "Aacute", 193); def(t, "Acirc", 194);  def(t, "Atilde", 195);
        def(t, "Auml", 196);   def(t, "Aring", 197);  def(t, "AElig", 198);  def(t, "Ccedil", 199);
        def(t, "Egrave", 200); def(t, "Eacute", 201); def(t, "Ecirc", 202);  def(t, "Euml", 203);
        def(t, "Igrave", 204); def(t, "Iacute", 205); def(t, "Icirc", 206);  def(t, "Iuml", 207);
        def(t, "ETH", 208);    def(t, "Ntilde", 209); def(t, "Ograve", 210); def(t, "Oacute", 211);
        def(t, "Ocirc", 212);  def(t, "Otilde", 213); def(t, "Ouml", 214);   def(t, "Oslash", 216);
        def(t, "Ugrave", 217); def(t, "Uacute", 218); def(t, "Ucirc", 219);  def(t, "Uuml", 220);
        def(t, "Yacute", 221); def(t, "THORN", 222);

        // Latin-1 lower-case letters
        def(t, "szlig", 223);  def(t, "agrave", 224); def(t, "aacute", 225); def(t, "acirc", 226);
        def(t, "atilde", 227); def(t, "auml", 228);   def(t, "aring", 229);  def(t, "aelig", 230);
        def(t, "ccedil", 231); def(t, "egrave", 232); def(t, "eacute", 233); def(t, "ecirc", 234);
        def(t, "euml", 235);   def(t, "igrave", 236); def(t, "iacute", 237); def(t, "icirc", 238);
        def(t, "iuml", 239);   def(t, "eth", 240);    def(t, "ntilde", 241); def(t, "ograve", 242);
        def(t, "oacute", 243); def(t, "ocirc", 244);  def(t, "otilde", 245); def(t, "ouml", 246);
        def(t, "oslash", 248); def(t, "ugrave", 249); def(t, "uacute", 250); def(t, "ucirc", 251);
        def(t, "uuml", 252);   def(t, "yacute", 253); def(t, "thorn", 254);  def(t, "yuml", 255);

        // Greek letters
        def(t, "Omega", 937);  def(t, "alpha", 945);  def(t, "delta", 948);  def(t, "epsilon", 949);
        def(t, "theta", 952);  def(t, "mu", 956);     def(t, "pi", 960);     def(t, "rho", 961);
        def(t, "omega", 969);

        // Punctuation, currency and mathematical symbols beyond Latin-1
        def(t, "dagger", 8224); def(t, "Dagger", 8225); def(t, "euro", 8364);  def(t, "numero", 8470);
        def(t, "frac13", 8531); def(t, "frac23", 8532); def(t, "part", 8706);  def(t, "minus", 8722);
        def(t, "radic", 8730);  def(t, "int", 8747);    def(t, "cong", 8773);  def(t, "asymp", 8776);
        def(t, "approx", 8776); def(t, "ne", 8800);     def(t, "equiv", 8801); def(t, "ge", 8805);

        return t;
    }

    /**
     * Looks up an entity name (without the leading '&' or trailing ';').
     *
     * @param s the entity name
     * @return the character the name denotes, or '^' if the name is not
     *         recognised, in the hope that a stray '^' will look sufficiently
     *         out of context to flag the invalid entity
     */
    private static char getChar(String s) {
        Character found = ENTITIES.get(s);
        return found == null ? BAD_ENTITY : found.charValue();
    }

    /**
     * Reads a string made up solely of the decimal digits 0-9.
     *
     * @param digits the captured entity text
     * @return the decimal value, clamped to one more than the largest char
     *         value if it would not fit in a char; or -1 if the string is
     *         empty or contains anything other than the digits 0-9
     */
    private static int parseCode(String digits) {
        int length = digits.length();
        if (length == 0) {
            return -1;
        }
        int value = 0;
        for (int i = 0; i < length; i++) {
            char d = digits.charAt(i);
            if (d < '0' || d > '9') {
                return -1;
            }
            value = value * 10 + (d - '0');
            if (value > Character.MAX_VALUE) {
                return Character.MAX_VALUE + 1; // stop early: cannot overflow int
            }
        }
        return value;
    }

    /**
     * Converts the text gathered between '&' (or '&#') and ';' to a character.
     * Text consisting only of digits is taken as a decimal character code;
     * anything else is looked up as an entity name.
     */
    private static char resolve(String entity) {
        int code = parseCode(entity);
        if (code < 0) {
            return getChar(entity);
        }
        // Code 0 is rejected because NUL is what capture() returns to mean
        // "nothing to emit"; codes above the char range cannot be represented.
        if (code == 0 || code > Character.MAX_VALUE) {
            return BAD_ENTITY;
        }
        return (char) code;
    }

    /**
     * CAPTURE A CHARACTER ENTITY
     *
     * <p>Accepts the next character of the input stream and returns the
     * character that should be emitted for it.
     *
     * @param c the next input character
     * @return <ul>
     *         <li>{@code c} itself when outside an entity;</li>
     *         <li>NUL (0) while an entity is being gathered;</li>
     *         <li>the decoded character when a terminating ';' completes a
     *             recognised entity;</li>
     *         <li>'^' when the completed entity is unknown or invalid, or when
     *             the entity grows too long (in which case the error flag read
     *             by {@link #getCEE()} is also set and capture is abandoned).</li>
     *         </ul>
     */
    static char capture(char c) {
        CEE = false; // the error flag only ever describes the current call

        // Phase 0: ordinary text. An ampersand opens a new entity.
        if (a == 0) {
            if (c != '&') {
                return c;
            }
            a = 1;
            Y = "";
            return NOTHING;
        }

        // Phase 1: first character after the ampersand. A hash introduces a
        // numeric entity of up to 4 digits; anything else is the first letter
        // of a named entity of up to 7 characters.
        if (a == 1) {
            if (c == '#') {
                Ymax = 4;
            } else {
                Ymax = 7;
                Y += c;
            }
            a = 2;
            return NOTHING;
        }

        // Phase 2: a semicolon completes the entity.
        if (c == ';') {
            a = 0;
            return resolve(Y);
        }

        // Phase 2: still gathering, provided there is room.
        if (Y.length() < Ymax) {
            Y += c;
            return NOTHING;
        }

        // Too long to be an entity: give up and drop back to ordinary text.
        a = 0;
        CEE = true;
        return BAD_ENTITY;
    }

    /**
     * Lets the word counter read the character entity error flag.
     *
     * @return true if the most recent call to {@link #capture(char)} abandoned
     *         an over-long entity
     */
    static boolean getCEE() {
        return CEE;
    }
}