diff options
author | Philip Hazel <ph10@hermes.cam.ac.uk> | 2004-10-07 15:04:35 +0000 |
---|---|---|
committer | Philip Hazel <ph10@hermes.cam.ac.uk> | 2004-10-07 15:04:35 +0000 |
commit | 495ae4b01f36d0d8bb0e34a1d7263c2b8224aa4a (patch) | |
tree | fcfaa2c623d4f155eef907b50b950b602829a30b /doc/doc-scripts/fc2k | |
parent | 0756eb3cb50d73a77b486e47528f7cb1bffdb299 (diff) |
Start
Diffstat (limited to 'doc/doc-scripts/fc2k')
-rwxr-xr-x | doc/doc-scripts/fc2k | 344 |
1 files changed, 344 insertions, 0 deletions
diff --git a/doc/doc-scripts/fc2k b/doc/doc-scripts/fc2k new file mode 100755 index 000000000..936392979 --- /dev/null +++ b/doc/doc-scripts/fc2k @@ -0,0 +1,344 @@ +#! /usr/bin/perl -w +# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $ + +# Script to read the HTML table of contents for the Exim FAQ and create an +# HTML KWIC index out of it. + + +######################################################################## +# List of words to ignore - kept alphabetically for reference, but they +# don't have to be in order. + +$ignore_list = " + +a ability able about address addresses addressed affect affected +after against all allow allowed allows already also although always am an and +and/or any anybody anyone anything anywhere are aren't arrange arrive as at + +back bad based basically be because been behave behaviour being best between +bob both bug build builds built busy but by + +call called calls can can't cannot causes causing central certain code comes +coming command commands complain complaining complains configure configured +conjunction contact contain contains contained correct correctly could +currently customer + +day days defined deliver delivers delivered delivery deliveries did do does +doesn't doing don't down during + +e-mail e-mails each easy else email emails entirely entries entry especially +etc even ever every example exim exim's experiencing + +far few file files find fine fly following for form found from fully + +get gets getting given gives giving go goes going got + +handle handles handled handling happen happens has have haven't having helpful +him host hosts how however + +i i'd i'm i've if in indeed instead into is issue issues isn't it it's its + +jim just + +keep keeps know knows + +like line lines look looked looking lot + +machine machines machine's mail mails main make me mean means message messages +might more must my myself + +near need neither no nor not now + +occur of off often ok on one only or other our out over own + +part parts particular per place possibility possible present problem problems +put puts + +quite + +raised rather really reason rid right round run runs + +same say saying see seeing seem seems seen sees set setting she should so some +somehow something sometimes stand state statement still strange such supposed +system systems + +take takes than that the their them then there these they things think this +those to try though to/for told too tried tries trying + +under until up use uses used using usually + +valid value values via + +want wanted wanting was way we we've well what what's when where whereabouts +whenever whether which while who whose why will with within without wish won't +wondered work worked working works would wrong + +xxx + +yet yyy + +"; +######################################################################## + + +# The regular expression fragment that defines the separator between words + +$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*"; + + +######################################################################## +# Function to add to a length to accommodate HTML stuff + +sub setlen{ +my($len, $s) = @_; + +$len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig); +$len += 1 while ($s =~ /&#\d+;/g); + +return $len; +} + + +######################################################################## +# Function to write out the list of initials with references + +sub write_initials { +my($this_initial) = "$_[0]"; + +print OUT "<p>\n "; + +foreach $initial (sort keys %initials) + { + if ($initial eq $this_initial) + { + print OUT " <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> "; + } + else + { + print OUT "<a href=\"FAQ-KWIC_$initial.html\"> $initial</a>"; + } + } + +print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n"; +} + + + +######################################################################## +# The main program. We can pick out the contents lines because they lie +# between <li> and </li> in the file, sometimes on more than one physical +# line. + +# Turn the list of ignorable words into a hash for quick lookup. Add the +# empty word to the list. + +@words = split /\s+/, $ignore_list; +foreach $word (@words) { $ignore{$word} = 1; } +$ignore{""} = 1; + + +# Open the file and do the job + +open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n"; + +while (<IN>) + { + next unless /^<li>/; + $_ .= <IN> while !/<\/li>$/; + chomp; + s/\n\s*/ /g; + + # Extract the operative text into $text, with the beginning in $pre. + + my($pre,$text,$post) = /^<li>(.*<\/a>:(?: )*)(.*)<br><br><\/li>$/; + + # Now split into words. As well as punctuation, there may be HTML thingies + # between words. Absorb them into the separators. + + my(@words) = split /$wordgap/, $text; + + # Lower case all the words, and remove those that we don't want. + # Then keep a list of all the used initials. + + REMOVE_IGNORE: + for ($i = 0; $i < scalar @words; $i++) + { + my($word) = $words[$i] = "\L$words[$i]\E"; + + # Remove certain forms of word and those on the ignore list + + if (defined $ignore{$word} || # word on ignore list + $word =~ /^-+$/ || # word consists entirely of hyphens + $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter + $word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen + $word =~ /[@^.]/ # contains @ or ^ or . + ) + { + splice(@words, $i, 1); + redo REMOVE_IGNORE if $i < scalar @words; + } + + # Otherwise, build up a list of initials + + else + { + my($inword) = $word; + $inword =~ s/^-//; + $initial = substr($inword, 0, 1); + $initials{"\U$initial\E"} = 1; + } + } + + # Create the lines for the KWIC index, and store them in associative + # arrays, with the keyword as the key. That will get them sorted + # automatically. + + while (scalar @words > 0) + { + my($word) = shift @words; + my($pretext, $casedword, $posttext) = + $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i; + + # Remove a leading hyphen from $word so that it sorts according to + # the leading letter. What is actually output is $casedword, which + # retains the hyphen. + + $word =~ s/^-//; + + my($prelen) = length $pretext; + my($postlen) = length $posttext; + + # We want to chop excessively long entries on either side. We can't set + # a fixed length because of the HTML control data. Call a function to + # add the given length to allow for HTML stuff. This is crude, but it + # does roughtly the right thing. + + my($leftlen) = &setlen(70, $pretext); + my($rightlen) = &setlen(70, $posttext); + + if ($prelen > $leftlen) + { + my($cutoff) = $leftlen; + $cutoff++ + while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " "); + $pretext = "... " . substr($pretext, -$cutoff); + } + + if ($postlen > $rightlen) + { + my($cutoff) = $rightlen; + $cutoff++ + while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " "); + $posttext = substr($posttext, 0, $cutoff) . "..."; + } + + # If the pre text has a font-ending not preceded by a font beginning + # (i.e. we've chopped the beginning off), we must insert a beginning. + + while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/) + { + $pretext = "<$2>" . $pretext; + } + + # If the pre text ends in a special font, we have to terminate that, + # and reset it at the start of the post text. + + my($poststart) = ""; + + while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/) + { + $pretext .= "</$1>"; + $poststart .= "<$1>"; + } + + # If the post text changes font but doesn't close it, we must add + # the closure. + + while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/) + { + $posttext .= "</$1>"; + } + + # Remove any unnecessary changes in either of them + + $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g; + $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g; + + # Save the texts in associative arrays. Add the question number to + # the end of the word to make the key. + + $pre =~ /(Q\d\d\d\d)/; + my($key) = "$word-$1"; + + $tableft{$key} = $pre . $pretext; + $tabright{$key} = $poststart . + "<font color=\"#FF0A0A\">$casedword</font>" . $posttext; + } + } + +close(IN); + +# Now write out the files. Each letter in the index goes in a different file + +$current_initial = ""; + +foreach $key (sort keys %tableft) + { + my($initial) = $key =~ /^(.)/; + $initial = "\U$initial\E"; + + if ($initial ne $current_initial) + { + if ($current_initial ne "") + { + print OUT "</table>\n"; + &write_initials($current_initial); + print OUT "</body>\n</html>\n"; + close OUT; + } + + open (OUT, ">html/FAQ-KWIC_$initial.html") || + die "Can't open html/FAQ-KWIC_$initial.html\n"; + print OUT + "<html>\n" . + "<head>\n" . + "<title>Exim FAQ: KWIC index section $initial</title>\n" . + "</head>\n" . + "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" . + "<h1>Exim FAQ: Keyword-in-context index</h1>\n"; + + write_initials($initial); + + if ($initial eq "A") + { + print OUT <<End ; +<p> +This <i>Keyword-in-context</i> index for the Exim FAQ is generated +automatically from the FAQ source. Browsers may not display the data very +prettily, but it is hoped that it may provide a useful aid for finding things +in the FAQ. +</p> +End + } + + print OUT "<table border>\n"; + $current_initial = $initial; + } + + print OUT "<tr>\n"; + print OUT "<td align=\"right\">$tableft{$key}</td>\n"; + print OUT "<td align=\"left\">$tabright{$key}</td>\n"; + print OUT "</tr>\n"; + } + +# Close the final file + +if ($current_initial ne "") + { + print OUT "</table>\n"; + &write_initials($current_initial); + print OUT "</body>\n</html>\n"; + close OUT; + } + +# End |