/**
* Word counter for text or HTML files
* @author Robert J Morton
* @version 04 September 2000
* @copyright Sep 2000 Robert J Morton (all rights reserved) */
/* This program accepts a filespec (path + filename). It then counts the
words in the given file, ignoring all HTML tags it may encounter. It
submits each word it encounters to the dictionary class for inclusion
in the dictionary. This program assumes all HTML files conform to strict
XHTML rules: eg. no tolerance of naked & < > characters in the text.
This program only counts words in the <body> section of a document.
Rules:
1) The end of a sentence is taken as being wherever a full-stop,
question mark or exclamation mark is followed by whitespace.
2) Disconnected apostrophe-s and apostrophe-t are not counted as
separate words. See wordCounter() method below for a full list
of excluded word fragments. */
import java.io.*; // for handling the file being scanned
/* Counts the characters, words and sentences in a text or HTML file.
   Every raw character is first passed through the HTML tag stripper and
   then through the character-entity filter; only characters that both
   filters hand back are counted and examined. Each accepted word is
   submitted to the dictionary object supplied at construction.
   An instance holds the running state of one scan at a time, so a single
   instance must not be used for two scans concurrently. */
class wordCnt {
  /* Truncated colloquial contractions: the stem that remains once the
     apostrophe has split the trailing "t" off as a separate fragment. */
  private static final String[] ABBRV = {
    "didn", "couldn", "wasn", "isn", "doesn",
    "hadn", "hasn", "aren", "wouldn", "shouldn"
  };
  /* Full contraction for the stem at the same index in ABBRV. */
  private static final String[] SUBST = {
    "didn't", "couldn't", "wasn't", "isn't", "doesn't",
    "hadn't", "hasn't", "aren't", "wouldn't", "shouldn't"
  };

  private final HTMLtag T;        // reference to HTML tag stripper
  private final dic D;            // reference to the dictionary class
  private int
    nc = 0,                       // number of characters in the file
    nw = 0,                       // number of words in the file
    ns = 0;                       // number of sentences in the file
  private char c;                 // current (filtered) character
  private String w = "";          // word currently being captured
  private boolean
    insideWord = false,           // true = currently inside a word
    endOfSentence = false;        // true = a . ? or ! is awaiting whitespace

  /* Store the dictionary object reference and create a new HTML tag
     stripper. Called from only one place in wc.main().
     D = dictionary that receives every counted word. */
  wordCnt(dic D) throws Exception {
    this.D = D;
    T = new HTMLtag();
  }

  /* SUBSTITUTES FOR COLLOQUIAL WORD ABBREVIATIONS
     Called from only one place in this class, when a word is completed.
     w = the word just captured.
     Returns the full contraction (eg "didn't") if w matches one of the
     truncated stems in ABBRV, ignoring case; otherwise returns w itself. */
  String abbrevs(String w) {
    for (int i = 0; i < ABBRV.length; i++) {
      if (w.equalsIgnoreCase(ABBRV[i])) {
        return SUBST[i];
      }
    }
    return w;
  }

  /* Returns true if the captured letters form a word that should be
     counted, ie. they are not one of the fragments left behind when an
     apostrophe splits a contraction, and not a lone capital letter. */
  private boolean isCountable(String word) {
    if (word.equals("s")          // decoupled apostrophe-s
        || word.equals("t")       // apostrophe-t [eg: in don't, won't]
        || word.equals("ll")      // the 'll on we'll etc.
        || word.equals("ve")      // the 've on we've etc.
        || word.equals("ts")) {   // the 'ts on don'ts etc.
      return false;
    }
    // A lone capital is treated as an initial, not as a word.
    return !(word.length() == 1 && Character.isUpperCase(word.charAt(0)));
  }

  /* Closes the word currently held in w: registers it with the dictionary
     and counts it if it is countable, then clears the accumulator. */
  private void endWord() {
    if (isCountable(w)) {
      D.submit(abbrevs(w));       // register word in dictionary
      ++nw;                       // and count it
    }
    w = "";                       // clear the word accumulator string
    insideWord = false;           // indicate that we are outside a word
  }

  /* COUNTS WORDS AND SENTENCES.
     Examines the current character c. Called once per accepted character
     from count(). */
  private void wordCounter() {
    /* A letter extends the current word (or starts a new one). */
    if (Character.isLetter(c)) {
      w += c;
      insideWord = true;
      return;
    }
    /* First non-letter after a word terminates that word. */
    if (insideWord) {
      endWord();
    }
    /* A sentence is counted at the first whitespace seen after a . ? or !
       The flag is not cleared by intervening non-whitespace characters,
       so closing quotes or brackets after the terminator are tolerated. */
    if (Character.isWhitespace(c) && endOfSentence) {
      ns++;
      endOfSentence = false;
    } else if (c == '.' || c == '?' || c == '!') {
      endOfSentence = true;
    }
  }

  /* COUNT THE CHARACTERS, WORDS AND SENTENCES IN AN HTML FILE
     Called from only one place in wc.scan().
     fp = filespec (path + filename) of the file to scan.
     Returns true if NO malformed character entity was reported during the
     scan, false if at least one was. If the file cannot be opened or read,
     the problem is reported on System.err and the counts gathered up to
     that point are left in place; the return value still only reflects
     character-entity errors. */
  boolean count(String fp) {
    w = "";                       // reset all per-file state
    nc = 0;
    nw = 0;
    ns = 0;
    insideWord = false;
    endOfSentence = false;
    boolean flag = true;          // true = no malformed character entities
    Reader in = null;
    try {
      in = new BufferedReader(new FileReader(fp));
      int x;                      // next raw input character, -1 at EOF
      while ((x = in.read()) != -1) {
        /* A zero from the tag stripper means the character belongs to
           an HTML tag (or otherwise lies outside the counted text); a
           zero from the entity filter means it is part of a character
           entity that has not yet been substituted. Skip both. */
        c = T.capture((char) x);
        if (c == 0) {
          continue;
        }
        c = charEnt.capture(c);
        if (c == 0) {
          continue;
        }
        /* Remember if a character-entity error has been signalled. */
        if (charEnt.getCEE()) {
          flag = false;
        }
        ++nc;                     // increment the character count
        wordCounter();            // do word and sentence count
      }
      /* The file may end while a word is still being accumulated (no
         trailing non-letter); without this the last word would be lost. */
      if (insideWord) {
        endWord();
      }
    } catch (Exception e) {
      /* Best effort: keep what has been counted, but do not hide the
         failure (eg. file not found) from the user. */
      System.err.println("wordCnt: error while scanning " + fp + ": " + e);
    } finally {
      if (in != null) {
        try {
          in.close();             // always release the file handle
        } catch (IOException ignored) {
          // nothing useful can be done if closing fails
        }
      }
    }
    return flag;
  }

  // EACH CALLED FROM ONLY ONE PLACE IN WC.SCAN().
  int getCharCount() { return nc; }   // number of characters in the file
  int getWordCount() { return nw; }   // number of words in the file
  int getSentCount() { return ns; }   // number of sentences in the file
}