_winreg (Windows)"
}
else
{ $a->dump;
die "Expected just one content of in - : @acontent"; }
}
if (ref $acontent[0])
{ if (($acontent[0]->tag eq "code") || ($acontent[0]->tag eq "tt"))
{ @acontent = @{$acontent[0]->content}; }
else
{ $acontent[0]->dump;
die "Expected string content of in
- : $acontent[0]"; } }
if (!defined($acontent))
{ $acontent = $index_prefix . $acontent[0];
$acontent_suffix = $acontent[0]; }
elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
{ die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }
if (!defined $ahref)
{ $dt->dump;
die "no HREF in nachor in
- "; }
my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
if (!defined $ahref_name)
{ # Reference to entire file
$ahref_name = ""; }
if ($ahref_name eq $l2h_broken_link_name)
{ if (!exists $file_index_entries_broken{$ahref_file})
{ $file_index_entries_broken{$ahref_file} = []; }
push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
next; }
if (!exists $file_index_entries{$ahref_file})
{ $file_index_entries{$ahref_file} = {}; }
# Don't do this! It appears to make a copy, which is not desired.
# my %index_entries = %{$file_index_entries{$ahref_file}};
if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name})
{ $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; }
# { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
# if ($acontent eq $oldcontent)
# { die "Multiple identical index entries?"; }
# die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent";
# print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
}
}
# Process one <dt>/<dd> pair of a nested index listing.
#   $dt  element whose content must be exactly one plain string; that string
#        (trailing spaces removed) is appended, followed by ", ", to the
#        global $index_prefix while the nested list is processed.
#   $dd  element whose content must be exactly one <dl>, which is handed to
#        process_index_dl_compact.
# Dies (after dumping the offending elements where available) if either
# element does not have the expected shape.
sub process_index_dt_and_dd ( $$ )
{ my ($dt, $dd) = check_args(2, @_);

  my $dtcontent;
  { my @dtcontent = @{$dt->content()};
    if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0]))
      { $dd->dump;
        $dt->dump;
        die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <dt>: @dtcontent"; }
    $dtcontent = $dtcontent[0];
    $dtcontent =~ s/ +$//; }

  my $ddcontent;
  { my @ddcontent = @{$dd->content()};
    if (scalar(@ddcontent) != 1)
      { die "Expected single <dd> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; }
    $ddcontent = $ddcontent[0]; }
  # A plain-text child has no ->tag method; report it with the same message
  # instead of failing on the method call.
  if ((!ref $ddcontent) || ($ddcontent->tag ne "dl"))
    { die "Expected <dl> as content of <dd>, but saw: $ddcontent"; }

  # Save the current prefix, extend it for the nested list, then restore it.
  push @index_prefixes, $index_prefix;
  $index_prefix .= $dtcontent . ", ";
  process_index_dl_compact($ddcontent);
  $index_prefix = pop(@index_prefixes);
}
###########################################################################
### Ordinary sections
###
# Convert one HTML section file to Texinfo, writing to the TEXI filehandle.
#   $file       file name; if it contains no "/" it is taken relative to the
#               global $html_directory
#   $depth      nesting depth of this section; @section_stack is cut back to
#               its first $depth entries before the new title is pushed
#   $nodetitle  node name for this section
# Side effects: updates @section_stack, resets and refills %footnotes, and
# sets %this_index_entries and @this_index_entries_broken for this file.
# Dies if the parsed tree is not <html> containing <head> then <body>.
sub process_section_file ( $$$ )
{ my ($file, $depth, $nodetitle) = check_args(3, @_);
  my $he = file_to_tree(($file =~ /\//) ? $file : $html_directory . $file);

  # print STDERR "process_section_file: $file $depth $nodetitle\n";

  # Keep only the titles of the enclosing sections.
  @section_stack = @section_stack[0..$depth-1];

  # Not a great nodename fixup scheme; need a more global view
  if ((defined $contents_fixups{$nodetitle})
      && (scalar(@section_stack) > 0))
    { my $up_title = $section_stack[$#section_stack];
      # hack for Python Standard Library
      $up_title =~ s/^(Built-in|Standard) Module //g;
      my ($up_first_word) = split(/ /, $up_title);
      $nodetitle = "$up_first_word $nodetitle";
    }

  push @section_stack, $nodetitle;
  # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

  $he->traverse(\&process_if_child_links, 'ignore text nodes');
  %footnotes = ();
  # $he->dump;
  $he->traverse(\&process_if_footnotes, 'ignore text nodes');
  # $he->dump;

  # Select the index entries recorded for this file, if any.
  if (exists $file_index_entries{$file})
    { %this_index_entries = %{$file_index_entries{$file}};
      # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
    }
  else
    { # print STDERR "Warning: no index entries for file $file\n";
      %this_index_entries = (); }

  if (exists $file_index_entries_broken{$file})
    { @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
  else
    { # print STDERR "Warning: no broken index entries for file $file\n";
      @this_index_entries_broken = (); }

  if ($he->tag() ne "html")
    { die "Expected <html> at top level"; }
  my @content = @{$he->content()};
  if ((!ref $content[0]) or ($content[0]->tag ne "head"))
    { $he->dump;
      die "<head> not first element of <html>"; }
  if ((!ref $content[1]) or ($content[1]->tag ne "body"))
    { $he->dump;
      die "<body> not second element of <html>"; }

  $content[1]->traverse(\&output_body);
}
# Stack of tags of the elements we are currently inside that prevent index
# entries from being written out right now.  do_deferred_index_entries
# emits nothing while this stack is non-empty.
my @index_deferrers = ();

# Maintain @index_deferrers for element $tag.
#   $startflag true:  entering the element, so push $tag.
#   $startflag false: leaving the element, so pop, check that the popped tag
#                     is $tag (die otherwise), then try to flush the deferred
#                     index entries.
sub push_or_pop_index_deferrers ( $$ )
{ my ($tag, $startflag) = check_args(2, @_);
  if (!$startflag)
    { my $popped = pop @index_deferrers;
      if ($popped ne $tag)
        { die "Expected $tag at top of index_deferrers but saw $popped; remainder = ", join(" ", @index_deferrers); }
      do_deferred_index_entries(); }
  else
    { push @index_deferrers, $tag; }
}
# Queue the index entries associated with anchor label $label onto
# @deferred_index_entries.
#   $label  anchor name ("" stands for the file as a whole)
#   $he     optional; the anchor element itself.  Only used when $label is
#           the LaTeX2HTML "broken link" name, in which case the right index
#           entries are guessed from nearby text.
# If %this_index_entries has entries for $label they are queued and nothing
# else is done.
sub label_add_index_entries ( $;$ )
{ my ($label, $he) = check_args_range(1, 2, @_);
  # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
  if (exists $this_index_entries{$label})
    { push @deferred_index_entries, @{$this_index_entries{$label}};
      return; }

  if ($label eq $l2h_broken_link_name)
    { # Guessing needs the anchor element; callers that pass only a label
      # give us nothing to look at.
      if (!defined $he)
        { return; }

      # Try to find some text to use in guessing which links should point here
      # I should probably only look at the previous element, or if that is
      # all punctuation, the one before it; collecting all the previous texts
      # is a bit of overkill.
      my @anchor_texts = collect_texts($he);
      my @previous_texts = collect_texts($he->parent, $he);
      # 4 elements is arbitrary; ought to filter out punctuation and small words
      # first, then perhaps keep fewer.  Perhaps also filter out formatting so
      # that we can see a larger chunk of text?  (Probably not.)
      # Also perhaps should do further chunking into words, in case the
      # index term isn't a chunk of its own (e.g., it was inside some inline
      # markup element).
      my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);

      my $guessed = 0;
      for my $text (@candidate_texts)
        { # my $orig_text = $text;
          # Skip texts that are only punctuation/spaces or are very short.
          if ($text =~ /^[\"\`\'().?! ]*$/)
            { next; }
          if (length($text) <= 2)
            { next; }
          # hack for Python manual; maybe defer until failure first time around?
          $text =~ s/^sys\.//g;
          for my $iterm (@this_index_entries_broken)
            { # I could test for zero:  LaTeX2HTML's failures in the Python
              # documentation are only for items of the form "... (built-in...)"
              if (index($iterm, $text) != -1)
                { push @deferred_index_entries, $iterm;
                  # print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
                  $guessed = 1;
                } } }
      if (!$guessed)
        { # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
        }
    }
}
# Write out the queued index entries, one per line, and empty the queue --
# but only if there is something queued and no element on @index_deferrers
# is currently blocking output.
# TODO (from the original notes): call this from more places, and perhaps
# take an HTML::Element argument so the "is it appropriate here?" check can
# live in this function.
sub do_deferred_index_entries ()
{ check_args(0, @_);
  my $have_pending = (scalar(@deferred_index_entries) > 0);
  my $unblocked = (scalar(@index_deferrers) == 0);
  if ($have_pending && $unblocked)
    { print TEXI "\n", join("\n", @deferred_index_entries), "\n";
      @deferred_index_entries = (); }
}
my $table_columns;		# undefined if not in a table
my $table_first_column;		# boolean

# HTML::Element traverse() callback that writes the Texinfo for one node to
# the TEXI filehandle.
#   $he         a text string or an HTML::Element
#   $startflag  true when entering the element, false when leaving it
# The remaining traverse() arguments (depth and, with HTML::Element 1.54,
# parent and index-in-parent) are accepted but ignored.
# Returns 0 where the children must not be traversed (the branch has already
# produced everything needed for them, or the element is being skipped) and
# 1 otherwise; text nodes return nothing.
sub output_body ( $$$;$$ )
{
  my ($he, $startflag) = (check_args_range(3, 5, @_))[0,1];

  if (!ref $he)
    { # Text node.  Pending index entries are flushed after the first space
      # (i.e., after the first word) rather than in the middle of a word.
      my $space_index = index($he, " ");
      if ($space_index != -1)
        { # Why does
          #   print TEXI texi_quote(substr($he, 0, $space_index+1));
          # give:  Can't locate object method "TEXI" via package "texi_quote"
          # (Because the definition texi_quote hasn't been seen yet.)
          print TEXI &texi_quote(substr($he, 0, $space_index+1));
          do_deferred_index_entries();
          print TEXI &texi_quote(substr($he, $space_index+1)); }
      else
        { print TEXI &texi_quote($he); }
      return; }

  my $tag = $he->tag();

  # Ordinary text markup first
  if (exists $inline_markup{$tag})
    { if ($startflag)
        { print TEXI "\@$inline_markup{$tag}\{"; }
      else
        { print TEXI "\}"; } }
  elsif ($tag eq "a")
    { my ($name, $href, @content) = anchor_info($he);
      if (!$href)
        { # This anchor is only here for indexing/cross referencing purposes.
          if ($startflag)
            { label_add_index_entries($name, $he); }
        }
      elsif ($href =~ /^(ftp|https?|news):/)
        { # External URL (https is recognized as well as http).
          if ($startflag)
            { # Should avoid second argument if it's identical to the URL.
              print TEXI "\@uref\{$href, "; }
          else
            { print TEXI "\}"; }
        }
      elsif ($href =~ /^\#(foot[0-9]+)$/)
        { # Footnote
          if ($startflag)
            { # Could double-check name and content, but I'm not
              # currently storing that information.
              print TEXI "\@footnote\{";
              $footnotes{$1}->traverse(\&output_body);
              print TEXI "\}";
              return 0; } }
      else
        { if ($startflag)
            { # cross-references are not active Info links, but no text is lost
              if ($missing_feature_warning) {
                print STDERR "Can't deal with internal HREF anchors yet:\n";
                $he->dump;
              }
            }
        }
    }
  elsif ($tag eq "br")
    { print TEXI "\@\n"; }
  elsif ($tag eq "body")
    { }
  elsif ($tag eq "center")
    { if (has_single_content_string($he)
          && ($ {$he->content}[0] =~ /^ *$/))
        { return 0; }
      ## There is no "center" environment; it only affects the current line
      # if ($startflag)
      #   { print TEXI "\n\@center\n"; }
      # else
      #   { print TEXI "\n\@end center\n"; }
    }
  elsif ($tag eq "div")
    { my $align = $he->attr('align');
      if (defined($align) && ($align eq "center"))
        { if (has_single_content_string($he)
              && ($ {$he->content}[0] =~ /^ *$/))
            { return 0; }
          if ($startflag)
            { print TEXI "\n\@center\n"; }
          else
            { print TEXI "\n\@end center\n"; } }
    }
  elsif ($tag eq "dl")
    { # Recognize the "dl containing only a dd containing only a pre"
      # paradigm for "@example"
      if (has_single_content_with_tag($he, "dd"))
        { my $he_dd = $ {$he->content}[0];
          if (has_single_content_with_tag($he_dd, "pre"))
            { my $he_pre = $ {$he_dd->content}[0];
              print_pre($he_pre);
              return 0; } }
      if ($startflag)
        { # Could examine the elements, to be cleverer about formatting.
          # (Also to use ftable, vtable...)
          print TEXI "\n\@table \@asis\n"; }
      else
        { print TEXI "\n\@end table\n"; }
    }
  elsif ($tag eq "dt")
    { push_or_pop_index_deferrers($tag, $startflag);
      if ($startflag)
        { print TEXI "\n\@item "; }
      else
        { } }
  elsif ($tag eq "dd")
    { if ($startflag)
        { print TEXI "\n"; }
      else
        { }
      if (scalar(@index_deferrers) != 0)
        { $he->dump;
          die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
      do_deferred_index_entries();
    }
  elsif ($tag =~ /^(font|big|small)$/)
    { # Do nothing for now.
    }
  elsif ($tag =~ /^h[1-6]$/)
    { # We don't need this because we never recursively enter the heading content.
      # push_or_pop_index_deferrers($tag, $startflag);
      # Build the section name from the heading's content and collect the
      # names of the anchors found inside it.
      my $secname = "";
      my @seclabels = ();
      for my $elt (@{$he->content})
        { if (!ref $elt)
            { $secname .= $elt; }
          elsif ($elt->tag eq "br")
            { }
          elsif ($elt->tag eq "a")
            { my ($name, $href, @acontent) = anchor_info($elt);
              if ($href)
                { $he->dump;
                  $elt->dump;
                  die "Nonsimple anchor in <$tag>"; }
              if (!defined $name)
                { die "No NAME for anchor in $tag"; }
              push @seclabels, $name;
              for my $subelt (@acontent)
                { $secname .= html_to_texi($subelt); } }
          else
            { $secname .= html_to_texi($elt); } }
      if ($secname eq "")
        { die "No section name in <$tag>"; }
      # print STDERR "section_stack for <$tag>$secname</$tag>: ", join(", ", @section_stack), "\n";
      if (scalar(@section_stack) == 1)
        { if ($section_stack[-1] ne "Top")
            { die "Not top? $section_stack[-1]"; }
          print TEXI "\@settitle $secname\n";
          print TEXI "\@c %**end of header\n";
          print TEXI "\n";
          print TEXI "\@node Top\n";
          print TEXI "\n"; }
      else
        { print TEXI "\n\@node $section_stack[-1]\n";
          print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
      for my $seclabel (@seclabels)
        { label_add_index_entries($seclabel); }
      # This should only happen once per file.
      label_add_index_entries("");
      if (scalar(@index_deferrers) != 0)
        { $he->dump;
          die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
      do_deferred_index_entries();
      return 0;
    }
  elsif ($tag eq "hr")
    { }
  elsif ($tag eq "ignore")
    { # Hack for ignored elements
      return 0;
    }
  elsif ($tag eq "li")
    { if ($startflag)
        { print TEXI "\n\n\@item\n";
          do_deferred_index_entries(); } }
  elsif ($tag eq "ol")
    { # Ordered list: Texinfo's numbered-list environment is @enumerate.
      if ($startflag)
        { print TEXI "\n\@enumerate\n"; }
      else
        { print TEXI "\n\@end enumerate\n"; } }
  elsif ($tag eq "p")
    { if ($startflag)
        { print TEXI "\n\n"; }
      if (scalar(@index_deferrers) != 0)
        { $he->dump;
          die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
      do_deferred_index_entries(); }
  elsif ($tag eq "pre")
    { print_pre($he);
      return 0; }
  elsif ($tag eq "table")
    { # Could also indicate common formatting for first column, or
      # determine relative widths for columns (or determine a prototype row)
      if ($startflag)
        { if (defined $table_columns)
            { $he->dump;
              die "Can't deal with table nested inside $table_columns-column table"; }
          $table_columns = table_columns($he);
          if ($table_columns < 2)
            { $he->dump;
              die "Column with $table_columns columns?"; }
          elsif ($table_columns == 2)
            { print TEXI "\n\@table \@asis\n"; }
          else
            { # Give every column an equal share of the width.
              print TEXI "\n\@multitable \@columnfractions";
              for (my $i=0; $i<$table_columns; $i++)
                { print TEXI " ", 1.0/$table_columns; }
              print TEXI "\n"; } }
      else
        { if ($table_columns == 2)
            { print TEXI "\n\@end table\n"; }
          else
            { print TEXI "\n\@end multitable\n"; }
          undef $table_columns; } }
  elsif (($tag eq "td") || ($tag eq "th"))
    { if ($startflag)
        { if ($table_first_column)
            { print TEXI "\n\@item ";
              $table_first_column = 0; }
          elsif ($table_columns > 2)
            { print TEXI "\n\@tab "; } }
      else
        { print TEXI "\n"; } }
  elsif ($tag eq "tr")
    { if ($startflag)
        { $table_first_column = 1; } }
  elsif ($tag eq "ul")
    { if ($startflag)
        { print TEXI "\n\@itemize \@bullet\n"; }
      else
        { print TEXI "\n\@end itemize\n"; } }
  else {
    # NOTE(review): the children of an unrecognized element are skipped only
    # when $missing_feature_warning is set; otherwise they are traversed.
    # Confirm that this difference is intended.
    if ($missing_feature_warning) {
      # I used to have a newline before "output_body" here.
      print STDERR "output_body: ignoring <$tag> tag\n";
      $he->dump;
      return 0;
    }
  }

  return 1;
}
# Write the content of a <pre> element to TEXI as an @example block.
#   $he_pre  element whose content must be exactly one plain string; dies
#            otherwise.
# No newline is written after "@example" or before "@end example"; the
# quoted content is written directly between them.
sub print_pre ( $ )
{ my ($he_pre) = check_args(1, @_);
  if (!has_single_content_string($he_pre))
    { die "Multiple or non-string content for <pre>: ", @{$he_pre->content}; }
  my $pre_content = $ {$he_pre->content}[0];
  print TEXI "\n\@example";
  print TEXI &texi_quote($pre_content);
  print TEXI "\@end example\n";
}
# Return the number of columns of a table: the largest number of children
# found in any <tr>.
#   $table  a <table> element, or (on recursive calls) a <thead>, <tbody>
#           or <p> wrapper around rows.
# Wrapper elements contribute only the column count of the rows inside
# them; their own child count is a number of rows, not of columns.
# Dies (after dumping) on any other kind of child.  Returns 0 for an element
# with no content.
sub table_columns ( $ )
{
  my ($table) = check_args(1, @_);
  my $result = 0;
  for my $row (@{$table->content || []}) {
    if (($row->tag eq "thead") || ($row->tag eq "tbody") || ($row->tag eq "p")) {
      $result = max($result, table_columns($row));
      next;
    }
    if ($row->tag ne "tr") {
      $table->dump;
      $row->dump;
      die "Expected <tr> as table row.";
    }
    $result = max($result, scalar(@{$row->content || []}));
  }
  return $result;
}
###########################################################################
### Utilities
###
# Return the numerically smaller of two values (the second when they are equal).
sub min ( $$ )
{ my ($first, $second) = check_args(2, @_);
  if ($first < $second)
    { return $first; }
  return $second;
}
# Return the numerically larger of two values (the second when they are equal).
sub max ( $$ )
{ my ($first, $second) = check_args(2, @_);
  if ($first > $second)
    { return $first; }
  return $second;
}
# Parse the HTML file $file into an HTML::TreeBuilder tree, run
# cleanup_parse_tree over it, and return the tree.
# Unknown tags are ignored by the parser.  Dies if $file is not readable,
# rather than continuing with an empty tree.
sub file_to_tree ( $ )
{ my ($file) = check_args(1, @_);
  if (! -r $file)
    { die "Cannot read HTML file $file"; }
  my $tree = HTML::TreeBuilder->new;
  $tree->ignore_unknown(1);
  # $tree->warn(1);
  $tree->parse_file($file);
  cleanup_parse_tree($tree);
  return $tree;
}
# Return 1 if element $he has exactly one child (of any kind), else 0.
# An element with no content list at all yields 0.  Dies if $he is not a
# reference.
sub has_single_content ( $ )
{ my ($he) = check_args(1, @_);
  # (Returning 0 here was once considered instead of dying.)
  die "Non-reference argument: $he" unless ref $he;
  my $children = $he->content;
  return 0 unless defined $children;
  return (scalar(@{$children}) == 1) ? 1 : 0;
}
# True exactly when $he has a single child, that child is an element (not
# text), and the child's tag is $tag.
sub has_single_content_with_tag ( $$ )
{ my ($he, $tag) = check_args(2, @_);
  return 0 unless has_single_content($he);
  my $only_child = $he->content->[0];
  return 0 unless ref $only_child;
  my $child_tag = $only_child->tag;
  return 0 unless defined $child_tag;
  return $child_tag eq $tag;
}
# Return 1 if $he has exactly one child and that child is plain text (not a
# reference), else 0.
sub has_single_content_string ( $ )
{ my ($he) = check_args(1, @_);
  return 0 unless has_single_content($he);
  my $only_child = $he->content->[0];
  return (ref $only_child) ? 0 : 1;
}
# Take an <a> element apart.  Returns the list (name, href, content...):
# the NAME attribute, the HREF attribute (either may be undefined), then the
# element's children (possibly none).  Dies, after dumping the element, if
# $he is not an <a>.
# (Original note: there is no obvious way to tell whether the anchor has
# further attributes.)
sub anchor_info ( $ )
{ my ($he) = check_args(1, @_);
  unless ($he->tag eq "a")
    { $he->dump;
      die "passed non-anchor to anchor_info"; }
  my $children = $he->content;
  my @content = (defined $children) ? @{$children} : ();
  my $href = $he->attr('href');
  my $name = $he->attr('name');
  return ($name, $href, @content);
}
# Return a copy of $text made safe for Texinfo: "@", "{" and "}" are each
# prefixed with "@", and " -- " becomes " --- ".
sub texi_quote ( $ )
{ my ($text) = check_args(1, @_);
  for ($text)
    { s/([\@\{\}])/\@$1/g;
      s/ -- / --- /g; }
  return $text;
}
# Clean up a section title so that it does not confuse Makeinfo or Info.
# In order: strip leading spaces, strip trailing spaces and colons, strip a
# leading section number (digits and dots followed by spaces), then delete
# every comma and every colon.  Returns the cleaned copy.
sub texi_remove_punctuation ( $ )
{ my ($text) = check_args(1, @_);
  for ($text)
    { s/^ +//g;
      s/[ :]+$//g;
      s/^[1-9][0-9.]* +//g;
      s/,//g;
      # Both embedded colons and " -- " confuse makeinfo, so colons are
      # dropped outright rather than rewritten as " -- ".
      s/://g; }
  return $text;
}
## Do not use this inside `traverse':  it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Remove element $he from the content list of $parent (default: $he's own
# parent) and clear $he's parent link.
# Returns 1 on success; dies if $he is not among $parent's children.
# NOTE(review): the loop was rebuilt from a damaged copy of this file;
# confirm against the upstream html2texi.pl.
sub html_remove ( $;$ )
{ my ($he, $parent) = check_args_range(1, 2, @_);
  if (!defined $parent)
    { $parent = $he->parent; }
  my $ref_pcontent = $parent->content;
  my @pcontent = @{$ref_pcontent};
  for (my $i=0; $i<scalar(@pcontent); $i++)
    { if ($pcontent[$i] eq $he)
        { # Modify the real content list, not the local copy.
          splice(@{$ref_pcontent}, $i, 1);
          $he->parent(undef);
          return 1; } }
  die "Didn't find $he in $parent";
}
# Replace element $orig by element $new in the content list of $parent
# (default: $orig's own parent).  $new's parent link is set to $parent and
# $orig's is cleared.
# Returns 1 on success; dies if $orig is not among $parent's children.
# NOTE(review): the loop was rebuilt from a damaged copy of this file;
# confirm against the upstream html2texi.pl.
sub html_replace ( $$;$ )
{ my ($orig, $new, $parent) = check_args_range(2, 3, @_);
  if (!defined $parent)
    { $parent = $orig->parent; }
  my $ref_pcontent = $parent->content;
  my @pcontent = @{$ref_pcontent};
  for (my $i=0; $i<scalar(@pcontent); $i++)
    { if ($pcontent[$i] eq $orig)
        { # Store into the real content list, not the local copy.
          $ {$ref_pcontent}[$i] = $new;
          $new->parent($parent);
          $orig->parent(undef);
          return 1; } }
  die "Didn't find $orig in $parent";
}
# Replace $orig, within $parent (default: $orig's own parent), by a freshly
# created <meta> element.  Returns whatever html_replace returns.
sub html_replace_by_meta ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
  my $placeholder = HTML::Element->new("meta");
  $parent = $orig->parent unless defined $parent;
  return html_replace($orig, $placeholder, $parent);
}
# Replace $orig, within $parent (default: $orig's own parent), by a freshly
# created <ignore> element.  Returns whatever html_replace returns.
sub html_replace_by_ignore ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
  my $placeholder = HTML::Element->new("ignore");
  $parent = $orig->parent unless defined $parent;
  return html_replace($orig, $placeholder, $parent);
}
###
### Collect text elements
###
# State shared between collect_texts and its traversal callback
# collect_if_text.
my @collected_texts;		# text nodes gathered so far
my $collect_texts_stoppoint;	# element at which collecting stops, if any
my $done_collecting;		# set once the stop point has been reached

# Return the text nodes found while traversing $root, in traversal order.
# If $stop is given, no text is collected once that element is reached.
sub collect_texts ( $;$ )
{ my ($root, $stop) = check_args_range(1, 2, @_);
  # print STDERR "collect_texts: $root $stop\n";
  ($collect_texts_stoppoint, $done_collecting) = ($stop, 0);
  @collected_texts = ();
  $root->traverse(\&collect_if_text);
  # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
  return @collected_texts;
}
# traverse() callback for collect_texts.  Only the first argument (the node)
# is used.  Text nodes are appended to @collected_texts; reaching the stop
# point sets $done_collecting.  Returns 1 only for an element that should be
# descended into, 0 in every other case.
sub collect_if_text ( $$$;$$ )
{ my ($he) = (check_args_range(3, 5, @_))[0];
  return 0 if $done_collecting;
  return 0 unless defined $he;
  unless (ref $he)
    { push @collected_texts, $he;
      return 0; }
  my $at_stoppoint = (defined $collect_texts_stoppoint) && ($he eq $collect_texts_stoppoint);
  if ($at_stoppoint)
    { $done_collecting = 1;
      return 0; }
  return 1;
}
###########################################################################
### Clean up parse tree
###
# Run the clean-up passes over the parse tree rooted at $he, each as its own
# element-only traversal and in this fixed order: delete_if_navigation,
# delete_extra_spaces, merge_dl, reorder_dt_and_dl.  Returns $he.
sub cleanup_parse_tree ( $ )
{ my ($he) = check_args(1, @_);
  my @passes = (\&delete_if_navigation,
                \&delete_extra_spaces,
                \&merge_dl,
                \&reorder_dt_and_dl);
  for my $pass (@passes)
    { # $he->dump;   # uncomment to inspect the tree before each pass
      $he->traverse($pass, 'ignore text nodes'); }
  return $he;
}
## Simpler version that deletes contents but not the element itself.
# sub delete_if_navigation ( $$$ )
# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
# { $he->delete();
# return 0; }
# else
# { return 1; }
# }
# traverse() callback: remove every <div class="navigation"> element.
# The element is taken out of its parent's content list and then deleted.
# Returns 0 (do not descend) for a removed element, 1 otherwise; nothing is
# done on the end-of-element call.
# NOTE(review): the removal loop was rebuilt from a damaged copy of this
# file; confirm against the upstream html2texi.pl.
sub delete_if_navigation ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "div") && (defined $he->attr('class')) && ($he->attr('class') eq 'navigation'))
    { my $ref_pcontent = $he->parent()->content();
      # Don't try to modify @pcontent, which appears to be a COPY.
      # my @pcontent = @{$ref_pcontent};
      for (my $i = 0; $i<scalar(@{$ref_pcontent}); $i++)
        { if ($ {$ref_pcontent}[$i] eq $he)
            { splice(@{$ref_pcontent}, $i, 1);
              last; } }
      $he->delete();
      return 0; }
  else
    { return 1; }
}
# traverse() callback that strips insignificant whitespace.  On entry to an
# element: for head, html, table, tr and ul elements delete_child_spaces is
# applied; delete_trailing_spaces is applied to every element.  Always
# descends (returns 1); nothing is done on the end-of-element call.
sub delete_extra_spaces ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  return unless $startflag;
  if ($he->tag =~ /^(head|html|table|tr|ul)$/)
    { delete_child_spaces($he); }
  delete_trailing_spaces($he);
  return 1;
}
# Remove from $he's content every text child that consists only of spaces
# (including the empty string).
# NOTE(review): the loop body of this sub, and the header and loop of
# delete_trailing_spaces below, were rebuilt from a damaged copy of this
# file; confirm against the upstream html2texi.pl.
sub delete_child_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  if (! defined $ref_content)
    { return; }
  for (my $i = 0; $i<scalar(@{$ref_content}); $i++)
    { my $elt = $ {$ref_content}[$i];
      if ((!ref $elt) && ($elt =~ /^ *$/))
        { splice(@{$ref_content}, $i, 1);
          # Re-examine this position: the next child has moved into it.
          $i--; } }
}

# Remove all-space text children of $he that come directly before a br, dd,
# dl, dt, hr, p or ul element; additionally, for dd, dt, h1-h6, li and p
# elements, remove an all-space last child.
sub delete_trailing_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  if (! defined $ref_content)
    { return; }
  # Could also check for previous element = /^h[1-6]$/.
  for (my $i = 0; $i<scalar(@{$ref_content})-1; $i++)
    { my $elt = $ {$ref_content}[$i];
      if ((!ref $elt) && ($elt =~ /^ *$/))
        { my $next_elt = $ {$ref_content}[$i+1];
          if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
            { splice(@{$ref_content}, $i, 1);
              $i--; } } }
  if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/)
    { my $last_elt = $ {$ref_content}[$#{$ref_content}];
      if ((defined $last_elt) && ($last_elt =~ /^ *$/))
        { pop @{$ref_content}; } }
}
# LaTeX2HTML sometimes puts the <dl> that should enclose a <dt>/<dd> pair
# *inside* the <dt>, after its text.  Because the parser then supplies an
# implicit outer <dl>, the tree ends up looking like
#   <p>
#     <dl>                (implicit)
#       <dt>
#         text1...
#         <dl>
#           <dd>
#             text2...
#           dt_or_dd1...
#           dt_or_dd2...
# which should become
#   <p>
#     <dl>
#       <dt>
#         text1...
#       <dd>
#         text2...
#       dt_or_dd1...
#       dt_or_dd2...
#
# traverse() callback performing that repair.  It applies only to a <p>
# whose first child is an implicit <dl>, whose first child is a <dt>, whose
# last child is a <dl>, whose first child is a <dd>.  The inner <dl> is
# replaced by an <ignore> placeholder and its children are inserted into
# the outer <dl> right after the <dt>.
# Returns 0 (do not traverse children) when a repair was made, 1 otherwise;
# nothing is done on the end-of-element call.
# NOTE(review): the parent links of the moved children are not updated here.
sub reorder_dt_and_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if ($he->tag() ne "p")
    { return 1; }
  my $ref_pcontent = $he->content();
  if (!defined $ref_pcontent)
    { return 1; }

  # <p> must start with an implicit <dl> ...
  my @pcontent = @{$ref_pcontent};
  if (!((scalar(@pcontent) >= 1)
        && (ref $pcontent[0]) && ($pcontent[0]->tag() eq "dl")
        && $pcontent[0]->implicit()))
    { return 1; }
  my $ref_dlcontent = $pcontent[0]->content();
  if (!defined $ref_dlcontent)
    { return 1; }

  # ... whose first child is a <dt> ...
  my @dlcontent = @{$ref_dlcontent};
  if (!((scalar(@dlcontent) >= 1)
        && (ref $dlcontent[0]) && ($dlcontent[0]->tag() eq "dt")))
    { return 1; }
  my $ref_dtcontent = $dlcontent[0]->content();
  if (!defined $ref_dtcontent)
    { return 1; }

  # ... whose last child is a <dl> ...
  my @dtcontent = @{$ref_dtcontent};
  if (!((scalar(@dtcontent) > 0)
        && (ref $dtcontent[$#dtcontent])
        && ($dtcontent[$#dtcontent]->tag() eq "dl")))
    { return 1; }
  my $ref_dl2content = $dtcontent[$#dtcontent]->content();
  if (!defined $ref_dl2content)
    { return 1; }

  # ... whose first child is a <dd>.
  my @dl2content = @{$ref_dl2content};
  if (!((scalar(@dl2content) > 0)
        && (ref ($dl2content[0]))
        && ($dl2content[0]->tag() eq "dd")))
    { return 1; }

  # print STDERR "CHANGING\n"; $he->dump();
  html_replace_by_ignore($dtcontent[$#dtcontent]);
  splice(@{$ref_dlcontent}, 1, 0, @dl2content);
  # print STDERR "CHANGED TO:\n"; $he->dump();
  return 0; # don't traverse children
}
# traverse() callback.  If we find a paragraph whose content is exactly
#   <hr>
#   <ul>
# then pass the paragraph to process_child_links (presumably to accumulate
# its links into the contents list -- see that function) and delete the
# paragraph.
# Returns 0 for such a paragraph, 1 for anything else; nothing is done on
# the end-of-element call.
sub process_if_child_links ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  return unless $startflag;

  return 1 if $he->tag() ne "p";
  my $children = $he->content();
  return 1 unless defined $children;

  my ($first, $second) = @{$children};
  my $is_child_links = (scalar(@{$children}) == 2)
                       && (ref $first) && ($first->tag() eq "hr")
                       && (ref $second) && ($second->tag() eq "ul");
  return 1 unless $is_child_links;

  process_child_links($he);
  $he->delete();
  return 0;
}
# If we find
#   <h4>
#     "Footnotes"
#   <dl>
#     <dt>
#       <a name=...>        (first anchor)
#       <a name=...>        (second anchor, same name)
#     <dd>
#       footnote text...
#     ...
# then record the footnote information and delete the section and list:
# each <dd> is stored in %footnotes under the anchors' shared name, and the
# heading, each <dd>, and the list itself are replaced by <ignore>
# placeholders.

# True after the "Footnotes" heading has been seen and until its <dl> has
# been processed.
my $process_if_footnotes_expect_dl_next = 0;

# traverse() callback implementing the above.  Returns 0 for the heading and
# for the footnote list, 1 otherwise; nothing is done on the end-of-element
# call.  Dies if the element following the heading is not a <dl>, or if the
# list is not made of <dt>/<dd> pairs whose <dt> holds two anchors with
# identical names.
sub process_if_footnotes ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "h4")
      && has_single_content_string($he)
      && ($ {$he->content}[0] eq "Footnotes"))
    { html_replace_by_ignore($he);
      $process_if_footnotes_expect_dl_next = 1;
      return 0; }

  if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
    { my $ref_content = $he->content();
      if (defined $ref_content)
        { $process_if_footnotes_expect_dl_next = 0;
          my @content = @{$ref_content};
          # Children come in <dt>, <dd> pairs.
          for (my $i=0; $i<$#content; $i+=2)
            { my $he_dt = $content[$i];
              my $he_dd = $content[$i+1];
              if ((!ref $he_dt) || (!ref $he_dd)
                  || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
                { $he->dump;
                  die "expected <dt> and <dd> at positions $i and ", $i+1; }
              my @dt_content = @{$he_dt->content() || []};
              if ((scalar(@dt_content) != 2)
                  || (!ref $dt_content[0]) || (!ref $dt_content[1])
                  || ($dt_content[0]->tag ne "a")
                  || ($dt_content[1]->tag ne "a"))
                { $he_dt->dump;
                  die "Expected 2 anchors as content of <dt>"; }
              # Only the names are needed; compare the FIRST anchor with the
              # SECOND one.
              my ($dt1_name) = anchor_info($dt_content[0]);
              my ($dt2_name) = anchor_info($dt_content[1]);
              if ($dt1_name ne $dt2_name)
                { $he_dt->dump;
                  die "Expected identical names for anchors"; }
              html_replace_by_ignore($he_dd);
              $he_dd->tag("div"); # has no effect
              $footnotes{$dt1_name} = $he_dd; }
          html_replace_by_ignore($he);
          return 0; } }

  if ($process_if_footnotes_expect_dl_next)
    { $he->dump;
      die "Expected <dl> for footnotes next"; }

  return 1;
}
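## Merge two adjacent paragraphs containing <dl> items, such as:
#   <p>
#     <dl>
#       <dt>
#         ...
#       <dd>
#         ...
#   <p>
#     <dl>
#       <dt>
#         ...
#       <dd>
#         ...
# The second paragraph is removed and the content of its <dl> is appended to
# the <dl> of the first paragraph.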
# traverse() callback.  Within the children of $he, every run of adjacent
# <p> elements that each contain nothing but a single <dl> is collapsed:
# the later paragraphs are removed from $he and the content of their <dl>
# is appended to the <dl> of the first paragraph of the run.
# Returns 1 after processing; returns nothing on the end-of-element call or
# when $he has no content.
sub merge_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  return unless $startflag;

  my $children = $he->content;
  return unless defined $children;

  # Is $node a <p> whose only child is a <dl>?
  my $is_dl_paragraph = sub
    { my ($node) = @_;
      return (ref $node) && ($node->tag eq "p")
             && has_single_content_with_tag($node, "dl"); };

  for (my $pos = 0; $pos < scalar(@{$children})-1; $pos++)
    { my $first_para = $children->[$pos];
      next unless $is_dl_paragraph->($first_para);
      my $target_dl = $first_para->content->[0];
      # The list shrinks as paragraphs are absorbed, so the upper bound is
      # re-evaluated on every pass while $pos stays put.
      while (($pos < scalar(@{$children})-1)
             && $is_dl_paragraph->($children->[$pos+1]))
        { my ($absorbed) = splice(@{$children}, $pos+1, 1);
          my $absorbed_dl = $absorbed->content->[0];
          $target_dl->push_content(@{$absorbed_dl->content}); }
      # The element that ended the run cannot start a new run, so skip it.
      $pos++; }
  return 1;
}
###########################################################################
### Testing
###
# Developer test driver, reached through the "-test" command-line flag.
#   $action  "view" or "": parse $file with clean-up and dump the tree
#            "raw":        parse $file without clean-up and dump the tree
#            "section":    run process_section_file on $file
#            "index":      run process_index_file on $file, then print_index_info
#   $file    HTML file to operate on
# Dies on any other action.
sub test ( $$ )
{ my ($action, $file) = check_args(2, @_);

  # General testing
  if (($action eq "view") || ($action eq ""))
    { my $cleaned_tree = file_to_tree($file);
      # Other things that have been useful to look at here:
      #   print STDERR $cleaned_tree->as_HTML;
      #   print STDERR $cleaned_tree->tag(), "\n";
      #   print STDERR @{$cleaned_tree->content()}, "\n";
      #   print STDERR @{$cleaned_tree->extract_links()}, "\n";
      $cleaned_tree->dump();
      return;
    }
  elsif ($action eq "raw")
    { my $raw_tree = HTML::TreeBuilder->new;
      $raw_tree->ignore_unknown(1);
      # $raw_tree->warn(1);
      $raw_tree->parse_file($file);
      # Deliberately no cleanup_parse_tree here.
      $raw_tree->dump();
      return;
    }
  # Test dealing with a section.
  elsif ($action eq "section")
    { process_section_file($file, 0, "Title");
    }
  # Test dealing with many sections (disabled: the condition is constant 0)
  elsif (0)
    { my @files = ("/homes/fish/mernst/tmp/python-doc/html/api/about.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/abstract.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/api.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/concrete.html",
                   # "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/debugging.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/embedding.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/front.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html",
                   # "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/importing.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/includes.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/index.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/initialization.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/intro.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/mapping.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/node24.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/number.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/object.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/objects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/os.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/processControl.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/sequence.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/threads.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/types.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/utilities.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html");
      for my $section_file (@files)
        { print STDERR "\n", "=" x 75, "\n", "$section_file:\n";
          process_section_file($section_file, 0, "Title");
        }
    }
  # Test dealing with index.
  elsif ($action eq "index")
    { process_index_file($file, "\@cindex");
      print_index_info();
    }
  else
    { die "Unrecognized action `$action'"; }
}
###########################################################################
### Main loop
###
# Convert a whole document, starting from its main/contents page $file.
# The output goes to "<name>.texi" in the current directory, where <name> is
# $file without a trailing "index.html"/".html" and without directories
# (the name of the current directory if nothing is left).
# Sets the globals $html_directory (directory part of $file) and
# $current_ref_tdf, processes $file as the "Top" section, then processes
# every entry queued on @contents_list, and finally writes one node per
# entry of @index_titles followed by @contents and @bye.
# Dies if the output file cannot be opened or closed cleanly.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # could also use File::Basename
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  if ($info_file eq "")
    { chomp($info_file = `pwd`); }
  $info_file =~ s/^.*\///;	# not the most efficient way to remove dirs

  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  my $texi_file = "$info_file.texi";
  # TEXI stays a global bareword handle because the output routines in this
  # file print to it by that name.
  open(TEXI, '>', $texi_file)
    or die "Cannot open $texi_file for writing: $!";
  binmode TEXI, ":utf8";

  print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # Not generated (yet); see the Texinfo manual for the layout of each:
  #  2. Summary Description and Copyright: an @ifinfo ... @end ifinfo
  #     segment holding the description and copying permissions that appear
  #     only in the Info file.
  #  3. Title and Copyright: a @titlepage ... @end titlepage segment holding
  #     the title and copyright pages that appear only in the printed manual.
  #  4. `Top' Node and Master Menu: the master @menu of all nodes.  Here the
  #     "@node Top" line is written by the heading handling of the first
  #     section; no master menu is written.

  $current_ref_tdf = [ "Top", 0, $file ];
  process_section_file($file, 0, "Top");
  # Each queue entry is [ title, depth, file ].
  while (scalar(@contents_list))
    { $current_ref_tdf = shift @contents_list;
      process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]);
    }

  print TEXI "\n";
  for my $indextitle (@index_titles)
    { print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
      print TEXI "\n"; }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";
  # Buffered write errors only show up when the handle is closed.
  close(TEXI)
    or die "Error closing $texi_file: $!";
}
# Main program.  It must come last in the file so that all the global
# variable initializations above have been executed before anything runs.
#   html2texi.pl -test [[ACTION] FILE]   run test() and exit
#   html2texi.pl CONTENTS-PAGE           convert the document
die "No arguments supplied to html2texi.pl"
  if scalar(@ARGV) == 0;

if ($ARGV[0] eq "-test")
  { my @test_args = @ARGV[1..$#ARGV];
    my $test_arg_count = scalar(@test_args);
    if ($test_arg_count > 2)
      { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
    elsif ($test_arg_count == 2)
      { test($test_args[0], $test_args[1]); }
    elsif ($test_arg_count == 1)
      { test("", $test_args[0]); }
    else
      { test("", "index.html"); }
    exit();
  }

die "Pass one argument, the main/contents page"
  if scalar(@ARGV) != 1;

process_contents_file($ARGV[0]);

# end of html2texi.pl