#! /usr/bin/perl -w
# $Cambridge: exim/doc/doc-scripts/fc2k,v 1.2 2004/10/14 09:53:11 ph10 Exp $

# Script to read the HTML table of contents for the Exim FAQ and create an
# HTML KWIC index out of it.


########################################################################
# List of words to ignore - kept alphabetically for reference, but they
# don't have to be in order.
#
# The value is one string: a leading space, the words separated by single
# spaces, one newline after "possibility ", and a trailing space. The
# qw()/join layout below is only for readability; it builds exactly that
# string, so any whitespace-delimited matching done elsewhere in the
# script sees the same bytes as before.

$ignore_list =
    " " . join(" ", qw(
        a ability able about absence access according actual address
        addresses addressed affect affected after against aka all allow
        allowed allows along already also although always am amount an
        ancient and and/or annoying another any anybody anyone anything
        anywhere apparent apparently are aren't around arrange arrive
        arrives as at back bad based basically be because been behave
        behaviour being best between bob both box bug build builds built
        busy but by call called calls can can't cannot causes causing
        central certain code comes coming command commands complain
        complaining complains configure configured conjunction contact
        contain contains contained correct correctly could currently
        customer day days defined deliver delivers delivered delivery
        deliveries did do does doesn't doing don't down during e-mail
        e-mails each easy either else email emails entirely entries entry
        especially etc even ever every example exim exim's experiencing
        far few file files find finds fine fix fixed fly following for
        form found from fully generate generated get gets getting given
        gives giving go goes going got handle handles handled handling
        happen happens has have haven't having helpful him host hosts how
        however i i'd i'm i've if in indeed instead into is issue issues
        isn't it it's its jim just keep keeps know knows like line lines
        look looked looking lot m machine machines machine's mail mails
        main make me mean means message messages might more much must my
        myself near need neither no nor not now occur of off often ok on
        one only or other our out over own part parts particular per
        place possibility
    )) . " \n" . join(" ", qw(
        possible present problem problems put puts quite raised rather
        really reason rid right round run runs same say saying see seeing
        seem seems seen sees set setting she should simply sit so some
        somehow something sometimes stand state statement still strange
        such supposed system systems take takes tell than that the their
        them then there these they things think this those thought to try
        though to/for told too tried tries trying under until up use uses
        used using usually valid value values via want wanted wanting was
        way we we've well what what's when where whereabouts whenever
        whether which while who whose why will with within without wish
        won't wondered work worked working works would wrong xxx yet yyy
    )) . " ";


########################################################################
# The regular expression fragment that defines the separator between words
#
# It is a plain string meant to be interpolated into a pattern. In order,
# it matches:
#   1. any run of the characters ] ( ) . ? , ; : " ' and/or <...> tags,
#   2. one or more whitespace characters, or the end of the string,
#   3. any run of the characters [ ( " ' ` and/or <...> tags.
# The (?>...) groups are atomic so a tag, once matched, is not re-split
# on backtracking.

$wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";


########################################################################
# Function to add to a length to accommodate HTML stuff
#
# Arguments:  $len  the starting length
#             $s    the text to scan
# Returns:    $len increased by
#               - the full length of every simple tag in $s, i.e. "<"
#                 or "</", letters only, then ">" (tags that carry
#                 attributes do not match and add nothing), and
#               - 1 for every run of digits followed by ";".
# NOTE(review): the digits-plus-semicolon pattern looks like the tail of a
# numeric character reference; confirm against the upstream script that
# no leading "&#" is missing from the pattern.

sub setlen {
    my ($len, $s) = @_;

    # Scalar-context //g walks the matches one at a time; $1 is the tag
    # matched on the current iteration.
    $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
    $len += 1          while ($s =~ /\d+;/g);

    return $len;
}


########################################################################
# Function to write out the list of initials with references
#
# Argument:  the initial that is the "current" one.
# Output:    written to the OUT filehandle, which the caller must have
#            opened. Every key of the global %initials is written in
#            sorted order; the key equal to the argument is written with a
#            space on both sides, the others with a leading space only.
#            The line is finished with four spaces and "FAQ Contents".
# NOTE(review): the strings below contain only whitespace and text. If
# markup (links, emphasis) is expected around the initials, it is not
# present here; confirm against the upstream script.

sub write_initials {
    my ($this_initial) = "$_[0]";

    print OUT "\n\n ";

    foreach my $initial (sort keys %initials) {
        if ($initial eq $this_initial) {
            print OUT " $initial ";
        }
        else {
            print OUT " $initial";
        }
    }

    print OUT " " x 4 . "FAQ Contents\n\n\n";
}


########################################################################
# The main program. We can pick out the contents lines because they lie
# between