bin/wc

#!/usr/bin/perl

=begin metadata

Name: wc
Description: paragraph, line, word, character, and byte counter
Author: Peter Prymmer, pvhp@best.com
License: perl

=end metadata

=cut

use strict;
use locale;

use File::Basename qw(basename);
use Getopt::Std qw(getopts);

use constant EX_SUCCESS => 0;
use constant EX_FAILURE => 1;

my $Program = basename($0);

sub usage {
    warn "usage: $Program [-a | [-p] [-l] [-w] [-m] [-c] ] [file...]\n";
    exit EX_FAILURE;
}

my %opt;
getopts('aplwmc', \%opt) or usage();

if ((!($opt{'p'} || $opt{'l'} || $opt{'w'} || $opt{'c'})) || ($opt{'a'})) {
   $opt{'l'} = 1; $opt{'w'} = 1; $opt{'c'} = 1;
}

if ($opt{'a'}) { $opt{'p'} = 1; $opt{'m'} = 1; }

my ($total_paras, $total_lines, $total_words, $total_chars, $total_bytes) =
 qw(0 0 0 0 0);

my $out = "";
my $par_flag = 0;

# Refer to Ken Lunde's B<CJKV Information Processing> pp. 01021-1022
# for further discussion (c) O'Reilly & Associates 1999, ISBN 1-56592-224-7.

# This encoding ought to have -m and -c yield the same counts
# no matter what the input.
# all single byte (ASCII+eight bit || ISO 8859-n || EBCDIC)
my $single_byte = q{ [\x00-\xFF] };

# UTF-8
my $utf_8 = q{
    [\x00-\x7F]                                                 # one byte
  | [\xC2-\xDF][\x80-\xBF]                                      # two byte
  | \xE0[\xA0-\xBF][\x80-\xBF]                                  # three byte
  | [\xE1-\xEF][\xA0-\xBF][\x80-\xBF]                           # three byte
  | \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]                       # four byte
  | [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]                # four byte
  | \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]            # five byte
  | [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]     # five byte
  | \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] # six byte
  | \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] # six byte
};

# This encoding ought to have -m yield -c/2
# no matter what the input.
# UCS-2
my $ucs_2 = q{ [\x00-\xFF][\x00-\xFF] };

# UTF-16 for little endian machines (Intel, VAX, etc.)
my $utf16_little_endian = q{
    [\x00-\xFF][\x00-\xD7\xE0-\xFF]               # UCS-2
  | [\x00-\xFF][\xD8-\xDB][\x00-\xFF][\xDC-\xDF]  # UTF-16 surrogates
};

# UTF-16 for big endian machines (Motorola, PPC, etc.)
my $utf_16_big_endian = q{
    [\x00-\xD7\xE0-\xFF][\x00-\xFF]               # UCS-2
  | [\xD8-\xDB][\x00-\xFF][\xDC-\xDF][\x00-\xFF]  # UTF-16 surrogates
};

# EUC-CN and EUC-KR
my $euc = q{
    [\x00-\x7F]              # code set 0 (ASCII or equivalent)
  | [\xA1-\xFE][\xA1-\xFE]   # code set 1 (GB 2312-80 or KS X 1001:1992)
};

# Big five
my $big_5 = q{
    [\x00-\x7F]                      # ASCII/CNS-Roman
  | [\xA1-\xFE][\x40-\x7E\xA1-\xFE]  # Big Five
};

# GBK and Big five plus
my $gbk = q{
    [\x00-\x7F]                      # ASCII or equivalent
  | [\x81-\xFE][\x40-\x7E\x80-\xFE]  # two byte (GBK or Big Five plus)
};

# EUC-TW
my $euc_tw = q{
    [\x00-\x7F]                            # code set 0 (CNS-Roman)
  | [\xA1-\xFE][\xA1-\xFE]                 # code set 1 (plane 1)
  | \x8E[\xA1-\xB0][\xA1-\xFE][\xA1-\xFE]  # code set 2 (planes 1-16)
};

# Shift-JIS
my $shift_jis = q{
    [\x00-\x7F]                               # ASCII/JIS-Roman
  | [\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]  # JIS X 0208:1997
  | [\xA0-\xDF]                               # half width katakana
};

# EUC-JP
my $euc_jp = q{
    [\x00-\x7F]                 # code set 0 (ASCII/JIS-Roman)
  | [\xA1-\xFE][\xA1-\xFE]      # code set 1 (JIS X 0208:1997)
  | \x8E[\xA0-\xDF]             # code set 2 (half width katakana)
  | \x8F[\xA1-\xFE][\xA1-\xFE]  # code set 3 (JIS X 0212:1990)
};

# Johab
my $johab = q{
    [\x00-\x7F]                               # ASCII/KS-Roman
  | [\x84-\xD3][\x41-\x7E\x81-\xFE]           # modern hangul
  | [\xD8-\xDE\xE0-\xF9][\x31-\x7E\x91-\xFE]  # symbols and hanja
};

# UHC
my $uhc = q{
    [\x00-\x7F]                               # one byte
  | [\x81-\xFE][\x41-\x5A\x61-\x7A\x81-\xFE]  # two byte
};

# change the following assignment to suite your multi-byte character
# needs:

my $encoding = $utf_8;

sub wc_fh {
    my ($fh, $filename) = @_;
    my $paras = 0;
    my $lines = 0;
    my $words = 0;
    my @words = (); # splitting into @_ is deprecated under -w
    my @chars = ();
    my $chars = 0;
    my $bytes = 0;
    # without the following binmode() byte counts on dosish machines will
    # come out the same as on unix and macs since CR's will not
    # be read in.  Unfortunately the byte count would be lower than
    # what other dosish system utilities report for file size
    # hence it is left in:
    binmode($fh);
    while (<$fh>) {
        $lines++;
        if ($opt{'p'}) {
            if (/^\s*$/) {
                if ($par_flag) {
                    $paras++;
                    $par_flag = 0;
                }
            }
            else {
                if ($paras == 0) { $paras++; }
                $par_flag = 1;
            }
        }
        if ($opt{'w'}) {
            s/\A\s+//;
            @words = split /\s+/;
            $words += scalar @words;
        }
        if ($opt{'m'}) {
            @chars = m/$encoding/gox;
            $chars += scalar(@chars);
        }
        if ($opt{'c'}) {
            $bytes += length($_);
        }
    }
    if ($paras > 1) { $paras--; }
    $total_paras += $paras;
    $total_lines += $lines; $total_words += $words;
    $total_chars += $chars; $total_bytes += $bytes;
    if (defined $filename) {
        $out = " $filename\n";
    } else {
        $out = "\n";
    }
    $out = sprintf(" %9u%s",$bytes,$out) if ($opt{'c'});
    $out = sprintf(" %9u%s",$chars,$out) if ($opt{'m'});
    $out = sprintf(" %9u%s",$words,$out) if ($opt{'w'});
    $out = sprintf(" %9u%s",$lines,$out) if ($opt{'l'});
    $out = sprintf(" %9u%s",$paras,$out) if ($opt{'p'});
    print "$out";
}

my $rc = EX_SUCCESS;
if (@ARGV) {
    foreach my $filename (@ARGV) {
        if (-d $filename) {
            warn "$Program: '$filename' is a directory\n";
            $rc = EX_FAILURE;
            next;
        }
        my $fh;
        unless (open $fh, '<', $filename) {
            warn "$Program: failed to open '$filename': $!\n";
            $rc = EX_FAILURE;
            next;
        }
        wc_fh($fh, $filename);
        unless (close $fh) {
            warn "$Program: failed to close '$filename': $!\n";
            $rc = EX_FAILURE;
        }
    }
} else {
    wc_fh(\*STDIN);
}

if ($#ARGV >= 1) {
    $out = sprintf(" %s\n","total");
    $out = sprintf(" %9u%s",$total_bytes,$out) if ($opt{'c'});
    $out = sprintf(" %9u%s",$total_chars,$out) if ($opt{'m'});
    $out = sprintf(" %9u%s",$total_words,$out) if ($opt{'w'});
    $out = sprintf(" %9u%s",$total_lines,$out) if ($opt{'l'});
    $out = sprintf(" %9u%s",$total_paras,$out) if ($opt{'p'});
    print "$out";
}
exit $rc;

__END__

=pod

=head1 NAME

wc - paragraph, line, word, character, and byte counter

=head1 SYNOPSIS

    wc [-a | [-p] [-l] [-w] [-m] [-c] ] [file...]

=head1 DESCRIPTION

I<wc> reads one or more input text files and, by default, writes the number of
lines, words, and bytes contained in each input file to the standard output.
An optional count of paragraphs or characters is also possible in this
implementation.  If more than one text file is specified, a line of total
count(s) for all named files is output on a separate line following the
last file count.  If no input files are specified, standard input is read.

By default, the standard output contains a line for each input file of
the form:

     lines     words     bytes file_name

With all options specified the output line for each input file is
of the form:

     paras     lines     words     chars     bytes file_name

I<wc> uses Perl's C<use locale> pragma.

=head2 OPTIONS

=over 5

=item none

No options to I<wc> is equivalent to specifying I<-l> I<-w>
and I<-c>.

=item B<-a>

Is equivalent to specifying I<-p> I<-l> I<-w> I<-m> and I<-c>.

=item B<-p>

Tells I<wc> to count paragraphs in the input file(s).
The algorithm employed counts lumped groups of lines that
do not contain only zero or more space characters (C</^\s*$/>).
This regular expression is sensitive to locale settings.

=item B<-l>

Tells I<wc> to count lines in the input file(s).

=item B<-w>

Tells I<wc> to count words in the input file(s) as
determined by perl's C</\w+/> regular expression which
is locale sensitive.

=item B<-m>

Tells I<wc> to count characters in the input file(s).
This is implemented with a multi-byte character counting
regular expression C<m/$encoding/gox>.  The C<$encoding>
defaults to one sensitive to well formed UTF-8 encodings
(one to six byte characters) though this may be altered
to other encodings by alteration of the program code.
Note that some 8 bit single byte characters will be
missed by the UTF-8 character counter since such characters
fall outside the UTF-8 encoding.

=item B<-c>

Tells I<wc> to count bytes in the input file(s).
This is implemented with perl's L<perlfunc/length>
built in function.

=back

=head1 ENVIRONMENT

The working of I<wc> may be influenced by your locale since it
uses the I<locale> pragma and this could have an effect on I<-w>
(word) and I<-p> (paragraph) counts.  I<wc> may also be influenced by
C<PERLLIB> or C<PERL5LIB> since it uses L<File::Basename>, L<locale>,
and L<strict> internally.

=head1 SEE ALSO

L<perllocale>.

=head1 BUGS

I<wc> has no known bugs.

=head1 STANDARDS

The I<-a> and I<-p> options are peculiarities of this Perl implementation.

=head1 AUTHOR

Peter Prymmer I<pvhp@best.com>.

=head1 COPYRIGHT and LICENSE

This program is copyright (c) by Peter Prymmer 1999.

This program is free and open software.  You may use, copy, modify, distribute
and sell this program (and any modified variants) in any way you wish,
provided you do not restrict others to do the same.

=cut