/**
* HTML tag extractor
* @author Robert J Morton
* @version 04 September 2000
* @copyright Sep 2000 Robert J Morton (all rights reserved) */
/* This program scans an incoming character stream. It returns each character
which is not part of an HTML tag unchanged. It returns a null character in
place of each character which is part of an HTML tag. It also returns a null
character in place of each character it encounters OUTSIDE the <body> and
</body> tags, and in place of each character it encounters INSIDE the
tag-pairs listed in the Tags[] array below. */
/* Stateful character filter: feed a document to capture() one character at a
   time; it hands back either the character itself or a null character (0)
   where the character is markup or lies in a region that is to be ignored. */
class HTMLtag {

    // HTML tags between which words are not to be counted
    private String Tags[] = {"table", "applet", "form"};

    /* Length of the longest string the accumulated tag text is ever compared
       with ("/body", or "/" + the longest entry in Tags[]). Once the
       accumulator has reached this length no comparison can succeed any more,
       so capture() stops growing it. */
    private int maxMatchLength = longestMatchLength();

    private int
        current = 0,    // index in Tags[] of the tag-pair we are currently inside
        depth = 0;      // number of unclosed Tags[current] start-tags (0 = not inside a pair)

    private boolean
        inTag = false,  // true while between a '<' and the following '>'
        inBody = false; // true while between <body> and </body>

    private String
        Tag = "";       // tag accumulator string (text seen since the last '<')

    /* Returns the length of the longest string capture() can match against. */
    private int longestMatchLength() {
        int longest = "/body".length();
        for (int i = 0; i < Tags.length; i++) {
            int endTagLength = Tags[i].length() + 1;   // +1 for the leading '/'
            if (endTagLength > longest) {
                longest = endTagLength;
            }
        }
        return longest;
    }

    /* FILTER OUT HTML TAGS
       (The original note says this is called from only one place, in
       wordCnt.count().)

       c is the next character of the document, in order.

       Returns c unchanged if it is ordinary text inside <body>...</body> and
       not inside one of the Tags[] pairs. Returns a null character (0) for
       every character of a tag and for text outside the body or inside a
       Tags[] pair. Returns a space at the moment the accumulated tag text
       becomes "br", so that a <br> separates words; as in the original code
       this space is returned regardless of the body / tag-pair state.

       Tag names are recognised as soon as the text after '<' equals the name,
       i.e. as a prefix ("<table border=1>" matches at "table"); the first '>'
       always ends the tag. Comparison is case-insensitive. */
    char capture(char c) {
        if (inTag) {
            if (c == '>') {
                inTag = false;               // tag-end: back to 'outside a tag'
            }
            /* Only keep accumulating while a match is still possible. This
               bounds the work per character without changing any result. */
            else if (Tag.length() < maxMatchLength) {
                Tag += c;
                // Fixed locale so the comparison does not vary with the platform default.
                String tag = Tag.toLowerCase(java.util.Locale.ENGLISH);

                // Track whether we are between <body> and </body>.
                if (inBody) {
                    if (tag.equals("/body")) {
                        inBody = false;
                    }
                } else if (tag.equals("body")) {
                    inBody = true;
                }

                if (depth > 0) {
                    /* Inside a prescribed pair: only that pair's own tags
                       matter. Other Tags[] entries are ignored here so that,
                       for example, a <form> inside a <table> cannot take over
                       and end the suppression early at </form>. */
                    if (tag.equals("/" + Tags[current])) {
                        depth--;             // reaches 0 when the outermost pair closes
                    } else if (tag.equals(Tags[current])) {
                        depth++;             // nested occurrence of the same tag
                    }
                } else {
                    /* Not inside a pair: see whether this tag opens one and,
                       if so, remember which. */
                    for (int i = 0; i < Tags.length; i++) {
                        if (tag.equals(Tags[i])) {
                            current = i;
                            depth = 1;
                            break;
                        }
                    }
                }

                /* The 'br' of a <br> tag yields a space to ensure word
                   separation. */
                if (tag.equals("br")) {
                    return ' ';
                }
            }
            c = 0;                           // characters of a tag are never returned
        }
        /* Not in a tag: a '<' starts one, so clear the accumulator and
           return a null character. */
        else if (c == '<') {
            inTag = true;
            Tag = "";
            c = 0;
        }

        /* Outside the document body, or inside a prescribed tag-pair,
           nothing is returned. */
        if (!inBody || depth > 0) {
            c = 0;
        }
        return c;
    }
}