From 495ae4b01f36d0d8bb0e34a1d7263c2b8224aa4a Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Thu, 7 Oct 2004 15:04:35 +0000 Subject: Start --- doc/doc-scripts/fc2k | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100755 doc/doc-scripts/fc2k (limited to 'doc/doc-scripts/fc2k') diff --git a/doc/doc-scripts/fc2k b/doc/doc-scripts/fc2k new file mode 100755 index 000000000..936392979 --- /dev/null +++ b/doc/doc-scripts/fc2k @@ -0,0 +1,344 @@ +#! /usr/bin/perl -w +# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $ + +# Script to read the HTML table of contents for the Exim FAQ and create an +# HTML KWIC index out of it. + + +######################################################################## +# List of words to ignore - kept alphabetically for reference, but they +# don't have to be in order. + +$ignore_list = " + +a ability able about address addresses addressed affect affected +after against all allow allowed allows already also although always am an and +and/or any anybody anyone anything anywhere are aren't arrange arrive as at + +back bad based basically be because been behave behaviour being best between +bob both bug build builds built busy but by + +call called calls can can't cannot causes causing central certain code comes +coming command commands complain complaining complains configure configured +conjunction contact contain contains contained correct correctly could +currently customer + +day days defined deliver delivers delivered delivery deliveries did do does +doesn't doing don't down during + +e-mail e-mails each easy else email emails entirely entries entry especially +etc even ever every example exim exim's experiencing + +far few file files find fine fly following for form found from fully + +get gets getting given gives giving go goes going got + +handle handles handled handling happen happens has have haven't having helpful +him host hosts how however + +i i'd i'm i've if in indeed instead into is issue issues isn't it it's its + +jim just + +keep keeps know knows + +like line lines look looked looking lot + +machine machines machine's mail mails main make me mean means message messages +might more must my myself + +near need neither no nor not now + +occur of off often ok on one only or other our out over own + +part parts particular per place possibility possible present problem problems +put puts + +quite + +raised rather really reason rid right round run runs + +same say saying see seeing seem seems seen sees set setting she should so some +somehow something sometimes stand state statement still strange such supposed +system systems + +take takes than that the their them then there these they things think this +those to try though to/for told too tried tries trying + +under until up use uses used using usually + +valid value values via + +want wanted wanting was way we we've well what what's when where whereabouts +whenever whether which while who whose why will with within without wish won't +wondered work worked working works would wrong + +xxx + +yet yyy + +"; +######################################################################## + + +# The regular expression fragment that defines the separator between words + +$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*"; + + +######################################################################## +# Function to add to a length to accommodate HTML stuff + +sub setlen{ +my($len, $s) = @_; + +$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig); +$len += 1 while ($s =~ /&#\d+;/g); + +return $len; +} + + +######################################################################## +# Function to write out the list of initials with references + +sub write_initials { +my($this_initial) = "$_[0]"; + +print OUT "

\n "; + +foreach $initial (sort keys %initials) + { + if ($initial eq $this_initial) + { + print OUT " $initial "; + } + else + { + print OUT " $initial"; + } + } + +print OUT " "x4 . "FAQ Contents\n

\n"; +} + + + +######################################################################## +# The main program. We can pick out the contents lines because they lie +# between

and

in the file, sometimes on more than one physical +# line. + +# Turn the list of ignorable words into a hash for quick lookup. Add the +# empty word to the list. + +@words = split /\s+/, $ignore_list; +foreach $word (@words) { $ignore{$word} = 1; } +$ignore{""} = 1; + + +# Open the file and do the job + +open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n"; + +while () + { + next unless /^

/; + $_ .= while !/<\/li>$/; + chomp; + s/\n\s*/ /g; + + # Extract the operative text into $text, with the beginning in $pre. + + my($pre,$text,$post) = /^

(.*<\/a>:(?: )*)(.*)

<\/li>$/; + + # Now split into words. As well as punctuation, there may be HTML thingies + # between words. Absorb them into the separators. + + my(@words) = split /$wordgap/, $text; + + # Lower case all the words, and remove those that we don't want. + # Then keep a list of all the used initials. + + REMOVE_IGNORE: + for ($i = 0; $i < scalar @words; $i++) + { + my($word) = $words[$i] = "\L$words[$i]\E"; + + # Remove certain forms of word and those on the ignore list + + if (defined $ignore{$word} || # word on ignore list + $word =~ /^-+$/ || # word consists entirely of hyphens + $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter + $word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen + $word =~ /[@^.]/ # contains @ or ^ or . + ) + { + splice(@words, $i, 1); + redo REMOVE_IGNORE if $i < scalar @words; + } + + # Otherwise, build up a list of initials + + else + { + my($inword) = $word; + $inword =~ s/^-//; + $initial = substr($inword, 0, 1); + $initials{"\U$initial\E"} = 1; + } + } + + # Create the lines for the KWIC index, and store them in associative + # arrays, with the keyword as the key. That will get them sorted + # automatically. + + while (scalar @words > 0) + { + my($word) = shift @words; + my($pretext, $casedword, $posttext) = + $text =~ /(.*?)(? $leftlen) + { + my($cutoff) = $leftlen; + $cutoff++ + while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " "); + $pretext = "... " . substr($pretext, -$cutoff); + } + + if ($postlen > $rightlen) + { + my($cutoff) = $rightlen; + $cutoff++ + while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " "); + $posttext = substr($posttext, 0, $cutoff) . "..."; + } + + # If the pre text has a font-ending not preceded by a font beginning + # (i.e. we've chopped the beginning off), we must insert a beginning. + + while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/) + { + $pretext = "<$2>" . $pretext; + } + + # If the pre text ends in a special font, we have to terminate that, + # and reset it at the start of the post text. + + my($poststart) = ""; + + while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/) + { + $pretext .= ""; + $poststart .= "<$1>"; + } + + # If the post text changes font but doesn't close it, we must add + # the closure. + + while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/) + { + $posttext .= ""; + } + + # Remove any unnecessary changes in either of them + + $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g; + $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g; + + # Save the texts in associative arrays. Add the question number to + # the end of the word to make the key. + + $pre =~ /(Q\d\d\d\d)/; + my($key) = "$word-$1"; + + $tableft{$key} = $pre . $pretext; + $tabright{$key} = $poststart . + "$casedword" . $posttext; + } + } + +close(IN); + +# Now write out the files. Each letter in the index goes in a different file + +$current_initial = ""; + +foreach $key (sort keys %tableft) + { + my($initial) = $key =~ /^(.)/; + $initial = "\U$initial\E"; + + if ($initial ne $current_initial) + { + if ($current_initial ne "") + { + print OUT "\n"; + &write_initials($current_initial); + print OUT "\n\n"; + close OUT; + } + + open (OUT, ">html/FAQ-KWIC_$initial.html") || + die "Can't open html/FAQ-KWIC_$initial.html\n"; + print OUT + "\n" . + "\n" . + "Exim FAQ: KWIC index section $initial\n" . + "\n" . + "\n" . + "

Exim FAQ: Keyword-in-context index

\n"; + + write_initials($initial); + + if ($initial eq "A") + { + print OUT < +This Keyword-in-context index for the Exim FAQ is generated +automatically from the FAQ source. Browsers may not display the data very +prettily, but it is hoped that it may provide a useful aid for finding things +in the FAQ. +

+End + } + + print OUT "\n"; + $current_initial = $initial; + } + + print OUT "\n"; + print OUT "\n"; + print OUT "\n"; + print OUT "\n"; + } + +# Close the final file + +if ($current_initial ne "") + { + print OUT "

$tableft{$key}

$tabright{$key}

\n"; + &write_initials($current_initial); + print OUT "\n\n"; + close OUT; + } + +# End -- cgit v1.2.3