#! /usr/bin/perl

# $Cambridge: exim/doc/doc-docbook/TidyHTML-spec,v 1.4 2006/04/04 14:03:49 ph10 Exp $

# Script to tidy up the spec HTML files that are generated by xmlto. The
# following changes are made:
#
# 1. Tidy the index.html file by splitting the very long lines.
# 2. Create reverse links from chapter and section titles back to the TOC.
# 3. Tidy the ix01.html file - the actual index - by splitting long lines.
# 4. Insert links from the letter divisions to the top of the Index.
# 5. Turn <div class="literallayout"><p> into <div class="literallayout"> and
#    a matching </p></div> into </div> to get rid of unwanted vertical white
#    space.
# 6. Before each occurrence of </td> insert &nbsp; so that the table's cell
#    is a little bit wider than the text itself.

# Work inside the directory that holds the generated HTML. Every file name
# used below is relative to it and the files are rewritten in place, so stop
# at once if the directory cannot be entered rather than risk touching files
# of the same names somewhere else.

chdir("spec_html") || die "Failed to change directory to spec_html: $!\n";

# Counter from which the ids given to the TOC entries are numbered.

$tocref = 1;

# Read in the index.html file. It's really the TOC.

open(my $toc_in, "<", "index.html") ||
  die "Failed to open index.html for reading: $!\n";
@toc = <$toc_in>;
close($toc_in);

# The whole TOC is generated as one humungous line that is hard to check, so
# break it up by putting a newline after every ">". The exception is a ">"
# that is preceded by 'class="quote"': a line break there puts a space into
# the output, and the same applies to the </span> that comes afterwards. Perl
# does not implement lookbehinds whose alternatives have different lengths, so
# the work is done in two passes. The first pass skips only the quote case;
# the second takes the newline back out after every </span>, which is the
# easy way of dealing with the one that matters.

for my $entry (@toc)
  {
  $entry =~ s/(?<!class="quote")>\s*/>\n/g;
  $entry =~ s/<\/span>\n/<\/span>/g;
  }

# Break the entries apart after each newline so that every line of text is a
# separate element in the vector.

@toc = map { split /(?<=\n)/, $_ } @toc;

# Each chapter and section title is to link back to its place in the TOC.
# Look through the TOC for lines that start with a link to a chapter or index
# page, give each such link an id of its own, and remember two things: which
# id belongs to which target (page name plus optional "#fragment"), and which
# page files are referred to.

for my $entry (@toc)
  {
  next unless $entry =~ /^<a href="((?:ch|ix)\d+\.html)(#[^"]+)?">/;

  # $fragment is undefined for a link to a whole page; it then contributes
  # nothing to the key below.
  my($page, $fragment) = ($1, $2);
  my($num) = sprintf "%04d", $tocref++;

  $entry =~ s/<a/<a id="toc$num"/;
  $backref{"$page$fragment"} = "toc$num";
  push @chlist, $page;
  }

# Write out the modified index.html file. The file is replaced in place, so
# check the write and the close as well as the open: a failure of buffered
# output may be reported only when the handle is closed.

open(my $toc_out, ">", "index.html") ||
  die "Failed to open index.html for writing: $!\n";
print {$toc_out} @toc or die "Failed to write index.html: $!\n";
close($toc_out) || die "Failed to write index.html: $!\n";

# Now scan each of the other page files and insert the reverse links. While
# we are at it, we tidy up <div class="literallayout"> by removing unwanted
# paragraph marks, which generate unwanted vertical space. We also insert
# &nbsp; before </td> to push table cells apart from each other.
#
# @chlist may name the same page more than once. Each page is handled once
# only: a repeat pass costs another read and rewrite of the file, and the
# heading pattern below could match again on an already-linked title that
# contains a further empty anchor, nesting one link inside another.

my %page_done;

foreach my $file (@chlist)
  {
  next if $page_done{$file}++;

  open(my $page_in, "<", $file) ||
    die "Failed to open $file for reading: $!\n";
  my @text = <$page_in>;
  close($page_in);

  # Insert a newline after certain elements, and split the lines so that each
  # one is a separate element in the vector. This makes it easier to recognize
  # these elements.

  foreach my $line (@text)
    {
    $line =~ s/<p>\s*(?!\n)/<p>\n/g;
    $line =~ s/<\/p>\s*(?!\n)/<\/p>\n/g;
    $line =~ s/<\/div>\s*(?!\n)/<\/div>\n/g;
    $line =~ s/<div([^>]*)>(?!\n)/<div$1>\n/g;
    }

  @text = map { split /(?<=\n)/, $_ } @text;

  # Set after the <p> that opens a literallayout <div> has been removed, and
  # cleared again once a </p> directly before a </div> has been removed.

  my $thisdiv = 0;

  for (my $i = 0; $i < scalar(@text); $i++)
    {
    # A title: an empty <a id="..."></a> anchor, then the title text, then
    # the closing heading tag. The anchor is turned into a link that wraps
    # the title text.

    if ($text[$i] =~ /^(.*?)<a( xmlns="[^"]+")? id="([^"]+)"><\/a>(.+?)<\/h(.*)$/)
      {
      my($pre, $opt, $id, $title, $post) = ($1, $2, $3, $4, $5);

      # Section reference
      my($ref) = $backref{"$file#$id"};

      # If not found, try for a chapter reference
      $ref = $backref{"$file"} if !defined $ref;

      # Adjust the line. If neither lookup found anything, $ref contributes
      # nothing and the link is just "index.html#". The pattern does not
      # capture the newline at the end of the line, so the rewritten line
      # runs straight on into the next one in the output.
      $text[$i]= "$pre<a$opt href=\"index.html#$ref\" id=\"$id\">$title</a></h$post";
      }

    # A literallayout <div> directly followed by <p>: blank out the <p>. The
    # pre-increment also steps the loop over the blanked element.

    elsif ($text[$i] =~ /^<div [^>]*?class="literallayout">$/ && $text[$i+1] eq "<p>\n")
      {
      $text[++$i] = "";
      $thisdiv = 1;
      }

    # The </p> directly before a </div>, seen after such a <p> was removed.

    elsif ($thisdiv && $text[$i] eq "</p>\n" && $text[$i+1] eq "</div>\n")
      {
      $text[$i] = "";
      $thisdiv = 0;
      }

    # A line that starts with </td>, optionally after white space.

    elsif ($text[$i] =~ /^\s*<\/td>/)
      {
      $text[$i] = "&nbsp;$text[$i]";
      }
    }

  # The page is replaced in place, so check the write and the close as well
  # as the open.

  open(my $page_out, ">", $file) ||
    die "Failed to open $file for writing: $!\n";
  print {$page_out} @text or die "Failed to write $file: $!\n";
  close($page_out) || die "Failed to write $file: $!\n";
  }

# Now process the ix01.html file

open(my $index_in, "<", "ix01.html") ||
  die "Failed to open ix01.html for reading: $!\n";
my @index = <$index_in>;
close($index_in);

# Insert a newline after every > because the whole index is generated as one
# humungous line that is hard to check. Then split the lines so that each one
# is a separate element in the vector.

foreach my $line (@index) { $line =~ s/>\s*/>\n/g; }
@index = map { split /(?<=\n)/, $_ } @index;

# We want to add a list of letters at the top of the index, and link back
# to them from each letter heading. First find the index title and remember
# where to insert the list of letters. $listindex stays undefined if there is
# no line consisting of just </h2>.

my $listindex;
my $i;

for ($i = 0; $i < scalar(@index); $i++)
  {
  if ($index[$i] =~ /^<\/h2>$/)
    {
    $listindex = $i;
    last;
    }
  }

# Now scan through for the letter headings and build the cross references,
# while also building up the list to insert. The scan carries on from where
# the search for the title stopped. A letter heading is a line holding one
# character and </h3>; the link is opened at the start of the line before it
# and closed at the end of the heading line itself.

my $list = "<h4>\n";
for (; $i < scalar(@index); $i++)
  {
  if ($index[$i] =~ /^(.)<\/h3>$/)
    {
    my $letter = $1;
    $index[$i-1] =~ s/^/<a id="${letter}B" href="#${letter}T">/;
    $index[$i] =~ s/$/<\/a>/;
    $list .= "<a id=\"${letter}T\" href=\"#${letter}B\"> $letter</a>\n";
    }
  }

# Now we know which letters we have, we can insert the list. If the title was
# not found there is no sensible place for it, so say so instead of putting
# an empty list at the very start of the file.

$list .= "</h4>\n";
if (defined $listindex)
  {
  splice @index, $listindex, 0, $list;
  }
else
  {
  warn "No index title found in ix01.html: list of letters not inserted\n";
  }

# Write out the modified ix01.html file. It is replaced in place, so check
# the write and the close as well as the open.

open(my $index_out, ">", "ix01.html") ||
  die "Failed to open ix01.html for writing: $!\n";
print {$index_out} @index or die "Failed to write ix01.html: $!\n";
close($index_out) || die "Failed to write ix01.html: $!\n";


# End