/** * HTML file word counter and dictionary builder. * @author Robert John Morton * @version 05 Sep 2000 */ /* This code uses the Java 1.1.8 API. Usage: Put the parent directory name in arg0 Sample command line entry: "java wc /home/rob/Private website" Do NOT put a final slash. This program counts the words in each HTML file it finds under a specified parent directory. It lists the full path to each file, together with its character count, word count and sentence count. It places the list in "wc.txt". The counting process excludes HTML tags and between certain prescribed tags (see HTMLtag.java). It also compiles a dictionary of all the words found in all the files and notes the total number of occurrences of each word for all the files put together. It puts the dictionary in the file "dic.txt". An asterisk is placed in front of the file path of files that contain any malformed character entities. */ import java.io.*; class wc { private static wordCnt ct; // ref to an instance of a word counter object private static dic D; // ref to an instance of a dictionary generator private static BufferedWriter o; // for output text file private static int widths[] = {60,10,9,10,8}, // column widths chars = 0, totalChars = 0, // count accumulators words = 0, totalWords = 0, sents = 0, totalSents = 0, dl = 0, // length of parent directory path name inc terminating '/' lang = 0; // language switch 0=english, 1=portuguese private static String title = "CHARACTERS WORDS SENTENCES WORDS/S FILESPEC" + " *contains malformed charents", ws = " ", // for words per sentence count s; // for general use universally /* Pads out a numeric string with leading spaces in oder to make the string a prescribed length. Called from 4 places each in both main() and scan(). */ private static String format(String s, int n) { for(int i = s.length(); i < widths[n]; i++) s = " " + s; return s; } /* When invoked, this method examines the files and directories contained within the directory 'd' passed to it as its parameter. If an entry is an HTML file, it writes that file's relative filespec + word count to 'wc.txt'. The 'relative' filespec is the path+filename from the point of view of the current directory. If an entry is a directory, it simply calls itself to deal with that (sub) directory as it is doing with the current direct- ory. Thus it can handle any depth of sub-directories from the parent. Called only by itself and main(). Calls ct.count() and ct.getCharCount(). */ private static void scan(String d) throws IOException { char ch = ' '; File fd = new File(d); // create file object for given directory name String D[] = fd.list(); // list all items in this directory /* For each HTML file in the sub-directory, get name of [next] sub-directory and create a file object for it. */ for(int i = 0; i < D.length; i++) { s = d + "/" + D[i]; File fs = new File(s); /* Provided it is an existing directory and it isn't a development directory, then re-enter this method. */ if(fs.isDirectory() && !s.endsWith("webtools") && !s.endsWith("images") && !s.endsWith("articles-pdf") && !s.endsWith("applets") && !s.endsWith("java_progs")) scan(s); else // else it should be a file if(fs.isFile() // so, if the file exists: && ((lang == 0 // if searching for English && s.endsWith(".html") && !s.endsWith("_br.html") && !s.endsWith("index.html")) || (lang == 1 // if searching for Portuguese && s.endsWith("_br.html")) ) ) { /* Count the characters, words and sentences in the file. If the file contained no malformed character entities, prefix its list entry with a space, otherwise mark it with an asterisk. Then form its filepath string prefixed with an asterisk if necessary.*/ if(ct.count(s)) ch = ' '; else ch = '*'; s = "" + ch + " " + s.substring(dl, s.length()); /* Add this file's character, word and sentence counts to their respective totals accumulators. */ totalChars += (chars = ct.getCharCount()); totalWords += (words = ct.getWordCount()); totalSents += (sents = ct.getSentCount()); /* Provided document contains at least one sentence, write the words per sentence ratio to the end of the line. Otherwise, to avoid a possible division by zero, write it as a blank field. */ if(sents > 0) ws = format(Integer.toString(words/sents),4); else ws = " ---"; /* ASSEMBLE THE PRINT LINE FOR THIS HTML FILE. Pad out and append its character count, word count & sentence count followed by its average words per sentence ratio, adding finally the relative filespec. */ s = format(Integer.toString(chars),1) + format(Integer.toString(words),2) + format(Integer.toString(sents),3) + ws + " " + s; /* Write the finished line to the output file, followed by a system- specific 'new-line', and also to the terminal console. */ o.write(s,0,s.length()); o.newLine(); } } } public static void main(String args[]) throws Exception { /* the only allowed arguement is "-pt" for counting words in articles written in portuguese. */ if(args.length > 0 && args[0].equals("-pt")) lang = 1; /* Form the full path and note its length + 1. Then create a file object from the full path. */ String d = "../../"; dl = d.length(); File pd = new File(d); /* If path is an existing directory, open output file in an output stream writer and wrap that in a buffered writer in order to be able to use write() */ if(pd.isDirectory()) { o = new BufferedWriter( new OutputStreamWriter( new FileOutputStream("wc.txt") ) ); /* Print the table's column headings followed by a system-specific 'new-line' to the output file and also to the system console. */ o.write(title,0,title.length()); o.newLine(); D = new dic(); // create instance of the dictionary generator ct = new wordCnt(D); // create instance of the word counter scan(d); // scan for HTML files in the specified directory tree o.newLine(); // miss a line ready for the totals /* Provided there is at least one sentence in document, output the total number of words and total number of sentences, else, to avoid the possibility of division by zero, output a blank field. */ if(totalSents > 0) ws = format(Integer.toString(totalWords/totalSents),4); else ws = " "; /* ASSEMBLE THE TOTALS PRINT LINE. Pad out and append the total character count, word count & sentence count followed by the overall average words per sentence ratio. */ s = format(Integer.toString(totalChars),1) + format(Integer.toString(totalWords),2) + format(Integer.toString(totalSents),3) + ws + " TOTALS"; o.write(s,0,s.length()); // write finished totals line to output file o.newLine(); // system-specific 'new-line' character(s) o.close(); // close the output file System.out.println(s); // print the line to the console also D.save(); // save the dictionary to a file dic.txt System.out.println("Results of this word count are in wc.txt."); } else System.out.println(d + " is not a directory."); } }