forked from briandfoy/PerlPowerTools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwc
executable file
·351 lines (274 loc) · 9.72 KB
/
wc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
#!/usr/bin/perl
=begin metadata
Name: wc
Description: paragraph, line, word, character, and byte counter
Author: Peter Prymmer, pvhp@best.com
License: perl
=end metadata
=cut
use strict;
use locale;
use File::Basename qw(basename);
use Getopt::Std qw(getopts);
use constant EX_SUCCESS => 0;
use constant EX_FAILURE => 1;
my $Program = basename($0);
sub usage {
warn "usage: $Program [-a | [-p] [-l] [-w] [-m] [-c] ] [file...]\n";
exit EX_FAILURE;
}
my %opt;
getopts('aplwmc', \%opt) or usage();
if ((!($opt{'p'} || $opt{'l'} || $opt{'w'} || $opt{'c'})) || ($opt{'a'})) {
$opt{'l'} = 1; $opt{'w'} = 1; $opt{'c'} = 1;
}
if ($opt{'a'}) { $opt{'p'} = 1; $opt{'m'} = 1; }
my ($total_paras, $total_lines, $total_words, $total_chars, $total_bytes) =
qw(0 0 0 0 0);
my $out = "";
my $par_flag = 0;
# Refer to Ken Lunde's B<CJKV Information Processing> pp. 01021-1022
# for further discussion (c) O'Reilly & Associates 1999, ISBN 1-56592-224-7.
# This encoding ought to have -m and -c yield the same counts
# no matter what the input.
# all single byte (ASCII+eight bit || ISO 8859-n || EBCDIC)
my $single_byte = q{ [\x00-\xFF] };
# UTF-8
my $utf_8 = q{
[\x00-\x7F] # one byte
| [\xC2-\xDF][\x80-\xBF] # two byte
| \xE0[\xA0-\xBF][\x80-\xBF] # three byte
| [\xE1-\xEF][\xA0-\xBF][\x80-\xBF] # three byte
| \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF] # four byte
| [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF] # four byte
| \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] # five byte
| [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] # five byte
| \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] # six byte
| \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] # six byte
};
# This encoding ought to have -m yield -c/2
# no matter what the input.
# UCS-2
my $ucs_2 = q{ [\x00-\xFF][\x00-\xFF] };
# UTF-16 for little endian machines (Intel, VAX, etc.)
my $utf16_little_endian = q{
[\x00-\xFF][\x00-\xD7\xE0-\xFF] # UCS-2
| [\x00-\xFF][\xD8-\xDB][\x00-\xFF][\xDC-\xDF] # UTF-16 surrogates
};
# UTF-16 for big endian machines (Motorola, PPC, etc.)
my $utf_16_big_endian = q{
[\x00-\xD7\xE0-\xFF][\x00-\xFF] # UCS-2
| [\xD8-\xDB][\x00-\xFF][\xDC-\xDF][\x00-\xFF] # UTF-16 surrogates
};
# EUC-CN and EUC-KR
my $euc = q{
[\x00-\x7F] # code set 0 (ASCII or equivalent)
| [\xA1-\xFE][\xA1-\xFE] # code set 1 (GB 2312-80 or KS X 1001:1992)
};
# Big five
my $big_5 = q{
[\x00-\x7F] # ASCII/CNS-Roman
| [\xA1-\xFE][\x40-\x7E\xA1-\xFE] # Big Five
};
# GBK and Big five plus
my $gbk = q{
[\x00-\x7F] # ASCII or equivalent
| [\x81-\xFE][\x40-\x7E\x80-\xFE] # two byte (GBK or Big Five plus)
};
# EUC-TW
my $euc_tw = q{
[\x00-\x7F] # code set 0 (CNS-Roman)
| [\xA1-\xFE][\xA1-\xFE] # code set 1 (plane 1)
| \x8E[\xA1-\xB0][\xA1-\xFE][\xA1-\xFE] # code set 2 (planes 1-16)
};
# Shift-JIS
my $shift_jis = q{
[\x00-\x7F] # ASCII/JIS-Roman
| [\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC] # JIS X 0208:1997
| [\xA0-\xDF] # half width katakana
};
# EUC-JP
my $euc_jp = q{
[\x00-\x7F] # code set 0 (ASCII/JIS-Roman)
| [\xA1-\xFE][\xA1-\xFE] # code set 1 (JIS X 0208:1997)
| \x8E[\xA0-\xDF] # code set 2 (half width katakana)
| \x8F[\xA1-\xFE][\xA1-\xFE] # code set 3 (JIS X 0212:1990)
};
# Johab
my $johab = q{
[\x00-\x7F] # ASCII/KS-Roman
| [\x84-\xD3][\x41-\x7E\x81-\xFE] # modern hangul
| [\xD8-\xDE\xE0-\xF9][\x31-\x7E\x91-\xFE] # symbols and hanja
};
# UHC
my $uhc = q{
[\x00-\x7F] # one byte
| [\x81-\xFE][\x41-\x5A\x61-\x7A\x81-\xFE] # two byte
};
# change the following assignment to suite your multi-byte character
# needs:
my $encoding = $utf_8;
sub wc_fh {
my ($fh, $filename) = @_;
my $paras = 0;
my $lines = 0;
my $words = 0;
my @words = (); # splitting into @_ is deprecated under -w
my @chars = ();
my $chars = 0;
my $bytes = 0;
# without the following binmode() byte counts on dosish machines will
# come out the same as on unix and macs since CR's will not
# be read in. Unfortunately the byte count would be lower than
# what other dosish system utilities report for file size
# hence it is left in:
binmode($fh);
while (<$fh>) {
$lines++;
if ($opt{'p'}) {
if (/^\s*$/) {
if ($par_flag) {
$paras++;
$par_flag = 0;
}
}
else {
if ($paras == 0) { $paras++; }
$par_flag = 1;
}
}
if ($opt{'w'}) {
s/\A\s+//;
@words = split /\s+/;
$words += scalar @words;
}
if ($opt{'m'}) {
@chars = m/$encoding/gox;
$chars += scalar(@chars);
}
if ($opt{'c'}) {
$bytes += length($_);
}
}
if ($paras > 1) { $paras--; }
$total_paras += $paras;
$total_lines += $lines; $total_words += $words;
$total_chars += $chars; $total_bytes += $bytes;
if (defined $filename) {
$out = " $filename\n";
} else {
$out = "\n";
}
$out = sprintf(" %9u%s",$bytes,$out) if ($opt{'c'});
$out = sprintf(" %9u%s",$chars,$out) if ($opt{'m'});
$out = sprintf(" %9u%s",$words,$out) if ($opt{'w'});
$out = sprintf(" %9u%s",$lines,$out) if ($opt{'l'});
$out = sprintf(" %9u%s",$paras,$out) if ($opt{'p'});
print "$out";
}
my $rc = EX_SUCCESS;
if (@ARGV) {
foreach my $filename (@ARGV) {
if (-d $filename) {
warn "$Program: '$filename' is a directory\n";
$rc = EX_FAILURE;
next;
}
my $fh;
unless (open $fh, '<', $filename) {
warn "$Program: failed to open '$filename': $!\n";
$rc = EX_FAILURE;
next;
}
wc_fh($fh, $filename);
unless (close $fh) {
warn "$Program: failed to close '$filename': $!\n";
$rc = EX_FAILURE;
}
}
} else {
wc_fh(\*STDIN);
}
if ($#ARGV >= 1) {
$out = sprintf(" %s\n","total");
$out = sprintf(" %9u%s",$total_bytes,$out) if ($opt{'c'});
$out = sprintf(" %9u%s",$total_chars,$out) if ($opt{'m'});
$out = sprintf(" %9u%s",$total_words,$out) if ($opt{'w'});
$out = sprintf(" %9u%s",$total_lines,$out) if ($opt{'l'});
$out = sprintf(" %9u%s",$total_paras,$out) if ($opt{'p'});
print "$out";
}
exit $rc;
__END__
=pod
=head1 NAME
wc - paragraph, line, word, character, and byte counter
=head1 SYNOPSIS
wc [-a | [-p] [-l] [-w] [-m] [-c] ] [file...]
=head1 DESCRIPTION
I<wc> reads one or more input text files and, by default, writes the number of
lines, words, and bytes contained in each input file to the standard output.
An optional count of paragraphs or characters is also possible in this
implementation. If more than one text file is specified, a line of total
count(s) for all named files is output on a separate line following the
last file count. If no input files are specified, standard input is read.
By default, the standard output contains a line for each input file of
the form:
lines words bytes file_name
With all options specified the output line for each input file is
of the form:
paras lines words chars bytes file_name
I<wc> uses Perl's C<use locale> pragma.
=head2 OPTIONS
=over 5
=item none
No options to I<wc> is equivalent to specifying I<-l> I<-w>
and I<-c>.
=item B<-a>
Is equivalent to specifying I<-p> I<-l> I<-w> I<-m> and I<-c>.
=item B<-p>
Tells I<wc> to count paragraphs in the input file(s).
The algorithm employed counts lumped groups of lines that
do not contain only zero or more space characters (C</^\s*$/>).
This regular expression is sensitive to locale settings.
=item B<-l>
Tells I<wc> to count lines in the input file(s).
=item B<-w>
Tells I<wc> to count words in the input file(s) as
determined by perl's C</\w+/> regular expression which
is locale sensitive.
=item B<-m>
Tells I<wc> to count characters in the input file(s).
This is implemented with a multi-byte character counting
regular expression C<m/$encoding/gox>. The C<$encoding>
defaults to one sensitive to well formed UTF-8 encodings
(one to six byte characters) though this may be altered
to other encodings by alteration of the program code.
Note that some 8 bit single byte characters will be
missed by the UTF-8 character counter since such characters
fall outside the UTF-8 encoding.
=item B<-c>
Tells I<wc> to count bytes in the input file(s).
This is implemented with perl's L<perlfunc/length>
built in function.
=back
=head1 ENVIRONMENT
The working of I<wc> may be influenced by your locale since it
uses the I<locale> pragma and this could have an effect on I<-w>
(word) and I<-p> (paragraph) counts. I<wc> may also be influenced by
C<PERLLIB> or C<PERL5LIB> since it uses L<File::Basename>, L<locale>,
and L<strict> internally.
=head1 SEE ALSO
L<perllocale>.
=head1 BUGS
I<wc> has no known bugs.
=head1 STANDARDS
The I<-a> and I<-p> options are peculiarities of this Perl implementation.
=head1 AUTHOR
Peter Prymmer I<pvhp@best.com>.
=head1 COPYRIGHT and LICENSE
This program is copyright (c) by Peter Prymmer 1999.
This program is free and open software. You may use, copy, modify, distribute
and sell this program (and any modified variants) in any way you wish,
provided you do not restrict others to do the same.
=cut