1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
|
#! /usr/bin/perl -w
# Script to read the HTML table of contents for the Exim FAQ and create an
# HTML KWIC index out of it.
########################################################################
# List of words to ignore - kept alphabetically for reference, but they
# don't have to be in order.
$ignore_list = "
a ability able about absence access according actual address addresses addressed
affect affected after against aka all allow allowed allows along already also
although always am amount an ancient and and/or annoying another any anybody
anyone anything anywhere apparent apparently are aren't around arrange arrive
arrives as at
back bad based basically be because been behave behaviour being best between
bob both box bug build builds built busy but by
call called calls can can't cannot causes causing central certain code comes
coming command commands complain complaining complains configure configured
conjunction contact contain contains contained correct correctly could
currently customer
day days defined deliver delivers delivered delivery deliveries did do does
doesn't doing don't down during
e-mail e-mails each easy either else email emails entirely entries entry
especially etc even ever every example exim exim's experiencing
far few file files find finds fine fix fixed fly following for form found from
fully
generate generated get gets getting given gives giving go goes going got
handle handles handled handling happen happens has have haven't having helpful
him host hosts how however
i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
jim just
keep keeps know knows
like line lines look looked looking lot
m machine machines machine's mail mails main make me mean means message messages
might more much must my myself
near need neither no nor not now
occur of off often ok on one only or other our out over own
part parts particular per place possibility possible present problem problems
put puts
quite
raised rather really reason rid right round run runs
same say saying see seeing seem seems seen sees set setting she should simply
sit so some somehow something sometimes stand state statement still strange such
supposed system systems
take takes tell than that the their them then there these they things think this
those thought to try though to/for told too tried tries trying
under until up use uses used using usually
valid value values via
want wanted wanting was way we we've well what what's when where whereabouts
whenever whether which while who whose why will with within without wish won't
wondered work worked working works would wrong
xxx
yet yyy
";
########################################################################
# The regular expression fragment that defines the separator between words
$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
########################################################################
# Function to add to a length to accommodate HTML stuff
sub setlen{
my($len, $s) = @_;
$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
$len += 1 while ($s =~ /&#\d+;/g);
return $len;
}
########################################################################
# Function to write out the list of initials with references
sub write_initials {
my($this_initial) = "$_[0]";
print OUT "<p>\n ";
foreach $initial (sort keys %initials)
{
if ($initial eq $this_initial)
{
print OUT " <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
}
else
{
print OUT "<a href=\"FAQ-KWIC_$initial.html\"> $initial</a>";
}
}
print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
}
########################################################################
# The main program. We can pick out the contents lines because they lie
# between <li> and </li> in the file, sometimes on more than one physical
# line.
# Turn the list of ignorable words into a hash for quick lookup. Add the
# empty word to the list.
@words = split /\s+/, $ignore_list;
foreach $word (@words) { $ignore{$word} = 1; }
$ignore{""} = 1;
# Open the file and do the job
open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
while (<IN>)
{
next unless /^<li>/;
$_ .= <IN> while !/<\/li>$/;
chomp;
s/\n\s*/ /g;
# Extract the operative text into $text, with the beginning in $pre.
my($pre,$text,$post) = /^<li>(.*<\/a>:(?: )*)(.*)<br><br><\/li>$/;
# Now split into words. As well as punctuation, there may be HTML thingies
# between words. Absorb them into the separators.
my(@words) = split /$wordgap/, $text;
# Lower case all the words, and remove those that we don't want.
# Then keep a list of all the used initials.
REMOVE_IGNORE:
for ($i = 0; $i < scalar @words; $i++)
{
my($word) = $words[$i] = "\L$words[$i]\E";
# Remove certain forms of word and those on the ignore list
if (defined $ignore{$word} || # word on ignore list
$word =~ /^-+$/ || # word consists entirely of hyphens
$word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
$word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen
$word =~ /[@^.]/ # contains @ or ^ or .
)
{
splice(@words, $i, 1);
redo REMOVE_IGNORE if $i < scalar @words;
}
# Otherwise, build up a list of initials
else
{
my($inword) = $word;
$inword =~ s/^-//;
$initial = substr($inword, 0, 1);
$initials{"\U$initial\E"} = 1;
}
}
# Create the lines for the KWIC index, and store them in associative
# arrays, with the keyword as the key. That will get them sorted
# automatically.
while (scalar @words > 0)
{
my($word) = shift @words;
my($pretext, $casedword, $posttext) =
$text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
# Remove a leading hyphen from $word so that it sorts according to
# the leading letter. What is actually output is $casedword, which
# retains the hyphen.
$word =~ s/^-//;
my($prelen) = length $pretext;
my($postlen) = length $posttext;
# We want to chop excessively long entries on either side. We can't set
# a fixed length because of the HTML control data. Call a function to
# add the given length to allow for HTML stuff. This is crude, but it
# does roughly the right thing.
my($leftlen) = &setlen(70, $pretext);
my($rightlen) = &setlen(70, $posttext);
if ($prelen > $leftlen)
{
my($cutoff) = $leftlen;
$cutoff++
while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
$pretext = "... " . substr($pretext, -$cutoff);
}
if ($postlen > $rightlen)
{
my($cutoff) = $rightlen;
$cutoff++
while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
$posttext = substr($posttext, 0, $cutoff) . "...";
}
# If the pre text has a font-ending not preceded by a font beginning
# (i.e. we've chopped the beginning off), we must insert a beginning.
while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
{
$pretext = "<$2>" . $pretext;
}
# If the pre text ends in a special font, we have to terminate that,
# and reset it at the start of the post text.
my($poststart) = "";
while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
{
$pretext .= "</$1>";
$poststart .= "<$1>";
}
# If the post text changes font but doesn't close it, we must add
# the closure.
while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
{
$posttext .= "</$1>";
}
# Remove any unnecessary changes in either of them
$pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
$posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
# Save the texts in associative arrays. Add the question number to
# the end of the word to make the key.
$pre =~ /(Q\d\d\d\d)/;
my($key) = "$word-$1";
$tableft{$key} = $pre . $pretext;
$tabright{$key} = $poststart .
"<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
}
}
close(IN);
# Now write out the files. Each letter in the index goes in a different file
$current_initial = "";
foreach $key (sort keys %tableft)
{
my($initial) = $key =~ /^(.)/;
$initial = "\U$initial\E";
if ($initial ne $current_initial)
{
if ($current_initial ne "")
{
print OUT "</table>\n";
&write_initials($current_initial);
print OUT "</body>\n</html>\n";
close OUT;
}
open (OUT, ">html/FAQ-KWIC_$initial.html") ||
die "Can't open html/FAQ-KWIC_$initial.html\n";
print OUT
"<html>\n" .
"<head>\n" .
"<title>Exim FAQ: KWIC index section $initial</title>\n" .
"</head>\n" .
"<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
"<h1>Exim FAQ: Keyword-in-context index</h1>\n";
write_initials($initial);
if ($initial eq "A")
{
print OUT <<End ;
<p>
This <i>Keyword-in-context</i> index for the Exim FAQ is generated
automatically from the FAQ source. Browsers may not display the data very
prettily, but it is hoped that it may provide a useful aid for finding things
in the FAQ.
</p>
End
}
print OUT "<table border>\n";
$current_initial = $initial;
}
print OUT "<tr>\n";
print OUT "<td align=\"right\">$tableft{$key}</td>\n";
print OUT "<td align=\"left\">$tabright{$key}</td>\n";
print OUT "</tr>\n";
}
# Close the final file
if ($current_initial ne "")
{
print OUT "</table>\n";
&write_initials($current_initial);
print OUT "</body>\n</html>\n";
close OUT;
}
# End
|