/** * Word counter for text or HTML files * @author Robert J Morton * @version 04 September 2000 * @copyright Sep 2000 Robert J Morton (all rights reserved) */ /* This program accepts a filespec (path + filename). It then counts the words in the given file, ignoring all HTML tags it may encounter. It submits each word it encounters to the dictionary class for inclusion in the dictionary. This program assumes all HTML files conform to strict XHTML rules: eg. no tolerance of naked & < > characters in the text. This program only counts words in the section of a document. Rules: 1) The end of a sentence is taken as being wherever a full-stop, question mark or exclamation mark is followed by whitespace. 2) Disconnected apostophe-s and apostrophe-t are not counted as separate words. See wordCounter() method below for a full list of excluded word fragments. */ import java.io.*; // for handling the file being scanned class wordCnt { private HTMLtag T; // reference to HTML tag stripper private dic D; // reference to the dictionary class private FileReader fr; // file reader for current file private int nc = 0, // number of characters in the file nw = 0, // number of words in the file nl = 0, // number of lines in the file ns = 0; // number of sentences in the file private char c; // current character private String w = ""; // string for word currently being captured private boolean insideWord = false, // true = currently inside a word endOfSentence = false, // true = end-of-sentence has been encountered beyondBodyTag = false; // start the word counting process /* Construct a new file data object from the file name, pass the dictionary generator object reference and create a new HTML tag stripper. Called from only one place in wc.main(). */ wordCnt(dic D) throws Exception { this.D = D; T = new HTMLtag(); } /* SUBSTITUTES FOR COLLOQUIAL WORD ABBREVIATIONS Called from only one place in wordCounter(). */ String abbrevs(String w) { String abbrv[] = { "didn","couldn","wasn","isn","doesn", "hadn","hasn","aren","wouldn","shouldn" }, subst[] = { "didn't","couldn't","wasn't","isn't","doesn't", "hadn't","hasn't","aren't","wouldn't","shouldn't" }; /* For each of the abbreviations, substitute any trunk- ated colloquial abbreviation with proper abbreviation.*/ for(int i = 0; i < abbrv.length; i++) if(w.equalsIgnoreCase(abbrv[i])) { w = subst[i]; break; } return w; } /* COUNTS WORDS AND SENTENCES. Called from only one place in count(). */ private void wordCounter() { // count words and sentences /* If this character is part of a word, add new character to current word and indicate that we are currently in a word. */ if(Character.isLetter(c)) { w += c; insideWord = true; } else { // if this character is other than a letter if(insideWord) { // if this is first non-letter after a word if(!w.equals("s") // Do not count a decoupled apostrophe-s && !w.equals("t") // or apostophe-t [eg: in don't, won't] && !w.equals("ll") // or the 'll on we'll etc. && !w.equals("ve") // or the 've on we've etc. && !w.equals("ts") // or the 'ts on don'ts etc. && !(w.length() == 1 // or a lone capital (an initial). && Character.isUpperCase(w.charAt(0)))) { D.submit(abbrevs(w)); //Otherwise, register word in dictionary ++nw; // and increment the number of words counted. } w = ""; // clear the word accumulator string insideWord = false; // indicate that we are outside a word } if(Character.isWhitespace(c) // If this character is && endOfSentence) { // whitespace after a .?! ns++; // increment the sentence count endOfSentence = false; // clear the end-of-sentence flag. } else // Else if this character if(c == '.' // is a full-stop || c == '?' // or a question-mark || c == '!') // or an exclamation mark endOfSentence = true; // then set the end-of-sentence flag. } // end of 'if this character is other than a letter' } /* COUNT THE CHARACTERS, WORDS, LINES, SENTENCES IN AN HTML FILE Called from only one place in wc.scan() */ boolean count(String fp) { w = ""; // string for word currently being captured nc = 0; // number of characters in the file nw = 0; // number of words in the file ns = 0; // number of sentences in the file insideWord = false; // true = we are currently inside a word endOfSentence = false; // true = an end-of-sentence has been encountered beyondBodyTag = false; // start the word counting process int x = 0; // for next input character boolean flag = true; // = no malformed character entities in this file try { fr = new FileReader(fp); // create a file reader for this file while((x = fr.read()) != -1) { // loop broken by End-Of-File /* Provided character is not part of an HTML tag AND it is not part of an un-substituted character entity AND provided we are now beyond the HTML tag THEN... */ if((c = T.capture((char)x)) != 0 && (c = charEnt.capture(c)) != 0){ /* If a character-entity error is encountered in this file, set the 'malformed character entity' flag to in- dicate that a character entity error has occurred. */ if(charEnt.getCEE()) flag = false; ++nc; // increment the character count wordCounter(); // do word count } } fr.close(); // close the file reader } catch(Exception e) { } // catches end-of-file exception return flag; // true if malformed char ent was encountered } // EACH CALLED FROM ONLY ONE PLACE IN WC.SCAN(). int getCharCount() {return nc;} // number of characters in the file int getWordCount() {return nw;} // number of words in the file int getSentCount() {return ns;} // number of sentences in the file }