/**
* HTML file word counter and dictionary builder.
* @author Robert John Morton
* @version 05 Sep 2000 */
/* This code uses the Java 1.1.8 API. Usage: "java wc" counts the English
articles; "java wc -pt" counts the Portuguese ("_br.html") articles instead.
The parent directory is not taken from the command line: main() fixes it as
"../../", relative to the directory from which the program is run. This
program counts the words in each HTML file it finds under that parent
directory. It lists the path to each file, together with its character
count, word count and sentence count. It places the list in "wc.txt". The
counting process excludes HTML tags and the text between certain prescribed
tags (see HTMLtag.java). It also compiles a dictionary of all the words
found in all the files and notes the total number of occurrences of each
word for all the files put together. It puts the dictionary in the file
"dic.txt". An asterisk is placed in front of the file path of files that
contain any malformed character entities. */
import java.io.*;
/* Command-line driver: walks a directory tree, has a wordCnt instance count
   the characters, words and sentences of every selected HTML file, writes
   one line per file plus a totals line to "wc.txt", and finally asks the
   dictionary generator to save itself. */
class wc {

    private static wordCnt ct;        // ref to an instance of a word counter object
    private static dic D;             // ref to an instance of a dictionary generator
    private static BufferedWriter o;  // for output text file

    // column widths, indexed by the 'n' argument of format()
    private static int[] widths = {60, 10, 9, 10, 8};

    private static int
        totalChars = 0,   // running totals over all files counted
        totalWords = 0,
        totalSents = 0,
        dl = 0,           // number of leading characters stripped from each
                          // path to form the filespec that is printed
        lang = 0;         // language switch 0=english, 1=portuguese

    private static String title =
        "CHARACTERS WORDS SENTENCES WORDS/S FILESPEC"
        + " *contains malformed charents";

    /* Directory name endings that mark development directories;
       scan() does not descend into these. */
    private static String[] skipDirs = {
        "webtools", "images", "articles-pdf", "applets", "java_progs"
    };

    /* Pads the string 's' with leading spaces until it is widths[n]
       characters long. A string that is already that long, or longer,
       is returned unchanged. A StringBuffer is used so that the padding
       is built in one pass rather than by repeated concatenation. */
    private static String format(String s, int n) {
        int pad = widths[n] - s.length();
        if (pad <= 0)
            return s;
        StringBuffer b = new StringBuffer(widths[n]);
        for (int i = 0; i < pad; i++)
            b.append(' ');
        return b.append(s).toString();
    }

    /* Returns true if 'path' ends with one of the development
       directory names that must not be scanned. */
    private static boolean isSkippedDir(String path) {
        for (int i = 0; i < skipDirs.length; i++)
            if (path.endsWith(skipDirs[i]))
                return true;
        return false;
    }

    /* Returns true if 'path' names an HTML file of the language currently
       selected by 'lang'. English: any ".html" file except "_br.html" and
       "index.html" files. Portuguese: "_br.html" files only. */
    private static boolean isWantedFile(String path) {
        if (lang == 0)
            return path.endsWith(".html")
                && !path.endsWith("_br.html")
                && !path.endsWith("index.html");
        if (lang == 1)
            return path.endsWith("_br.html");
        return false;
    }

    /* Counts one HTML file, adds its counts to the running totals and
       writes its line to the output file. The line is made up of the
       padded character, word and sentence counts, the whole-number words
       per sentence ratio, a marker (space = clean, asterisk = the counter
       reported malformed character entities) and the file's path with
       its first 'dl' characters removed. */
    private static void report(String path) throws IOException {
        // ct.count() returns false when the file had malformed entities
        char mark = ct.count(path) ? ' ' : '*';
        String spec = "" + mark + " " + path.substring(dl, path.length());

        int chars = ct.getCharCount();
        int words = ct.getWordCount();
        int sents = ct.getSentCount();
        totalChars += chars;
        totalWords += words;
        totalSents += sents;

        /* Only work out words per sentence when there is at least one
           sentence; otherwise show a placeholder to avoid dividing by 0. */
        String ws;
        if (sents > 0)
            ws = format(Integer.toString(words / sents), 4);
        else
            ws = " ---";

        String line = format(Integer.toString(chars), 1)
                    + format(Integer.toString(words), 2)
                    + format(Integer.toString(sents), 3)
                    + ws + " " + spec;
        o.write(line, 0, line.length());
        o.newLine();
    }

    /* Examines every entry of directory 'd'. A sub-directory that is not a
       development directory is scanned recursively, so any depth of nesting
       below the parent is handled. A file selected by isWantedFile() is
       counted and listed by report(). Everything else is ignored.
       File.list() returns null when 'd' cannot be listed (not a directory,
       or an I/O error); such a directory is reported on the error stream
       and skipped rather than being allowed to cause a null reference.
       Called only by itself and main(). */
    private static void scan(String d) throws IOException {
        String[] entries = new File(d).list();
        if (entries == null) {
            System.err.println("Cannot list directory: " + d);
            return;
        }
        for (int i = 0; i < entries.length; i++) {
            // a local, so that recursion cannot disturb the current path
            String path = d + "/" + entries[i];
            File f = new File(path);
            if (f.isDirectory() && !isSkippedDir(path))
                scan(path);
            else if (f.isFile() && isWantedFile(path))
                report(path);
        }
    }

    /* Program entry point. The only recognised argument is "-pt", which
       selects the Portuguese articles instead of the English ones. Writes
       the table to "wc.txt", prints the totals line to the console as well,
       then saves the dictionary. Prints a message and does nothing else if
       the parent directory does not exist. */
    public static void main(String args[]) throws Exception {
        if (args.length > 0 && args[0].equals("-pt"))
            lang = 1;

        /* The parent directory, and the number of leading characters
           to strip from each path when it is listed. */
        String d = "../../";
        dl = d.length();
        File pd = new File(d);
        if (!pd.isDirectory()) {
            System.out.println(d + " is not a directory.");
            return;
        }

        String totals;
        o = new BufferedWriter(
                new OutputStreamWriter(
                    new FileOutputStream("wc.txt")));
        /* The finally clause guarantees that the output file is flushed
           and closed even if the scan or a write fails part-way through. */
        try {
            o.write(title, 0, title.length());   // column headings
            o.newLine();
            D = new dic();        // create instance of the dictionary generator
            ct = new wordCnt(D);  // create instance of the word counter
            scan(d);              // scan the directory tree for HTML files
            o.newLine();          // miss a line ready for the totals

            /* Overall words per sentence, or a blank field when no
               sentences were found, to avoid dividing by zero. */
            String ws;
            if (totalSents > 0)
                ws = format(Integer.toString(totalWords / totalSents), 4);
            else
                ws = " ";

            totals = format(Integer.toString(totalChars), 1)
                   + format(Integer.toString(totalWords), 2)
                   + format(Integer.toString(totalSents), 3)
                   + ws + " TOTALS";
            o.write(totals, 0, totals.length());
            o.newLine();
        } finally {
            o.close();
        }

        System.out.println(totals);  // print the totals line to the console also
        D.save();                    // save the dictionary to a file dic.txt
        System.out.println("Results of this word count are in wc.txt.");
    }
}