/**
 * HTML tag extractor.
 *
 * <p>Filters an incoming character stream one character at a time. A
 * character is passed through unchanged only when it is ordinary text that
 * lies inside the document's {@code <body>} ... {@code </body>} section and
 * outside every excluded tag-pair (see {@code EXCLUDED_TAGS}). A null
 * character (0) is returned in place of:
 * <ul>
 *   <li>every character that is part of an HTML tag, including the
 *       {@code <} and {@code >} delimiters;</li>
 *   <li>every character encountered outside the {@code <body>} section;</li>
 *   <li>every character encountered between an excluded tag and its
 *       matching end tag.</li>
 * </ul>
 * The one exception is the {@code <br>} tag, for which a single space is
 * returned so that the words on either side of it stay separated.
 *
 * <p>An instance carries scanning state from one call to the next, so one
 * instance must be used per document and fed its characters in order.
 *
 * @author Robert J Morton
 * @version 04 September 2000
 * @copyright Sep 2000 Robert J Morton (all rights reserved)
 */
class HTMLtag {

  /** Returned in place of every suppressed input character. */
  private static final char NUL = 0;

  /** HTML tags between which words are not to be counted. */
  private static final String[] EXCLUDED_TAGS = {"table", "applet", "form"};

  /**
   * Length of the longest text the accumulator can ever match (an end tag
   * such as "/applet"). Anything longer can be discarded unseen.
   */
  private static final int LONGEST_MATCH = longestMatch();

  private boolean inTag = false;        // true when inside an HTML tag
  private boolean inBody = false;       // true when between <body> and </body>
  private boolean betweenTags = false;  // true when inside an excluded tag-pair
  private int openTag = 0;              // index in EXCLUDED_TAGS of the open pair
  private int depth = 0;                // nesting depth of the open pair
  private String tagText = "";          // tag accumulator string

  /** Works out the longest tag text that any comparison below can match. */
  private static int longestMatch() {
    int longest = "/body".length();
    for (int i = 0; i < EXCLUDED_TAGS.length; i++) {
      // +1 allows for the leading '/' of the end tag
      longest = Math.max(longest, EXCLUDED_TAGS[i].length() + 1);
    }
    return longest;
  }

  /**
   * Filters out HTML tags according to strict XML rules, i.e. the markup is
   * assumed to be well formed. The original notes state that this is called
   * from only one place, in wordCnt.count().
   *
   * @param c the next character of the document
   * @return {@code c} itself when it is countable body text; a space at the
   *         point where a {@code <br>} tag name is recognised; otherwise a
   *         null character (0)
   */
  char capture(char c) {
    if (inTag) {
      if (c == '>') {          // tag-end: back to 'outside a tag'
        inTag = false;
        return NUL;
      }
      return scanTagChar(c);
    }

    if (c == '<') {            // tag-start: begin accumulating a new tag
      inTag = true;
      tagText = "";
      return NUL;
    }

    // Ordinary text: suppressed when outside the document body or when
    // inside one of the excluded tag-pairs.
    return (inBody && !betweenTags) ? c : NUL;
  }

  /**
   * Adds one character to the tag accumulator and updates the scanning state
   * as soon as the accumulated text spells a tag name of interest. Matching
   * is done after every character so that a tag is recognised whether or not
   * attributes follow its name.
   */
  private char scanTagChar(char c) {
    // Once the text is longer than anything we look for, it can never match
    // again, so stop growing it. This keeps long tags and comments cheap.
    if (tagText.length() > LONGEST_MATCH) {
      return NUL;
    }
    tagText += c;

    // Track entry to and exit from the document body.
    if (inBody) {
      if (tagIsEndOf("body")) {
        inBody = false;
      }
    } else if (tagIs("body")) {
      inBody = true;
    }

    if (betweenTags) {
      // Inside an excluded pair only that pair's own tags matter. Another
      // excluded tag must not take over, otherwise its end tag would switch
      // counting back on while the outer pair is still open.
      String open = EXCLUDED_TAGS[openTag];
      if (tagIsEndOf(open)) {
        if (--depth == 0) {
          betweenTags = false;
        }
      } else if (tagIs(open)) {
        depth++;               // same tag nested inside itself
      }
    } else {
      // Not inside an excluded pair: see whether this tag opens one.
      for (int i = 0; i < EXCLUDED_TAGS.length; i++) {
        if (tagIs(EXCLUDED_TAGS[i])) {
          betweenTags = true;
          openTag = i;
          depth = 1;
          break;
        }
      }
    }

    // The 'br' of a <br> tag yields a space to ensure word separation.
    // NOTE(review): as in the original, this space is returned even outside
    // the body or inside an excluded pair; it separates words but is never
    // itself a countable character.
    return tagIs("br") ? ' ' : NUL;
  }

  /** True when the accumulated tag text is exactly {@code name}, ignoring case. */
  private boolean tagIs(String name) {
    return tagText.equalsIgnoreCase(name);
  }

  /** True when the accumulated tag text is exactly "/" + {@code name}, ignoring case. */
  private boolean tagIsEndOf(String name) {
    return tagText.length() == name.length() + 1
        && tagText.charAt(0) == '/'
        && tagText.regionMatches(true, 1, name, 0, name.length());
  }
}