#! /usr/bin/perl -w
# xml2tex: a perl script to process XML documents and generate TeX code
# Copyright (C) 1999 Ed Cashin
#
# version 1.1
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
# 

use strict;
use XML::Parser;
use Time::localtime;
use Getopt::Long;

# ----------------------------------------------------------------------
# Driver: xml2tex [--help] file.xml ...
# Every named file is parsed in turn; the handlers below print the
# generated TeX on standard output.
# ----------------------------------------------------------------------
my ($help_wanted);
my $options = GetOptions("help" => \$help_wanted);

# Print the usage hint and stop.  Carrying on after the hint would mix it
# into the generated TeX (with --help) or run with nothing to parse.
# A failed option parse or a missing file list is reported as an error
# through the exit status.
if ($help_wanted || ! $options || ! @ARGV) {
    print "run perldoc $0 for usage\n";
    exit($help_wanted ? 0 : 1);
}

my @files  = @ARGV;
my $parser = XML::Parser->new(
    ErrorContext => 2,
    Handlers     => {
        'Init'  => \&inithandler,
        'Start' => \&starthandler,
        'End'   => \&endhandler,
        'Char'  => \&charhandler,
        'Final' => \&finalhandler,
    },
);

# NOTE(review): the Init/Final handlers presumably fire for every
# parsefile call, so several input files would each contribute their own
# preamble and \bye to the output -- confirm that is intended.
foreach my $filename (@files) {
    $parser->parsefile($filename);
}

# Registered as the parser's 'Init' handler: print the fixed TeX preamble.
# The preamble loads eplain, sets the paragraph and item spacing macros,
# allocates \tmphsize and declares the fonts the other emitters select.
sub inithandler {
    my @preamble = (
        '\input eplain',
        '\def\normalparskip{1 ex plus .5ex}',
        '\def\itemskip{1 em}',
        '\parskip=\normalparskip',
        '\parindent=0in',
        '',
        '\newdimen\tmphsize',
        '',
        '\font\tenvtt=cmvtt10 at 10 truept',
        '\font\twelvevtt=cmvtt10 at 12 truept',
        '',
        '\font\tenrm=cmr10 at 10 truept',
        '',
        '\font\tenbx=cmbx10 at 10 truept',
        '',
        '\font\twelvessbx=cmssbx10 at 12 truept',
        '\font\fifteenssbx=cmssbx10 at 15 truept',
        '',                     # the preamble ends with one blank line
    );

    # every entry becomes one output line
    print join('', map { "$_\n" } @preamble);
}

# Registered as the parser's 'Start' handler: emit the opening TeX for a
# recognised element.
#
#   $p        - the parser object, handed on to the emitter
#   $el       - the element name
#   @attrlist - the element's attributes as a flat list; only the <item>
#               emitter receives it (as an array reference)
#
# Element names that match none of the patterns produce no output.
sub starthandler {
    my ($p, $el, @attrlist) = @_;

    # Tried top to bottom; the first matching pattern wins.  A true third
    # field marks an emitter that is also given the attribute list.
    my @dispatch = (
        [ qr/\btitle\b/,    \&starttitle         ],
        [ qr/\bsubtitle\b/, \&startsubtitle      ],
        [ qr/\bit\b/,       \&startitalic        ],
        [ qr/\bbf\b/,       \&startbold          ],
        [ qr/\bsect\b/,     \&startsect          ],
        [ qr/\bcode\b/,     \&startcode          ],
        [ qr/\bitemize\b/,  \&startitemize       ],
        [ qr/\bitem\b/,     \&startitem,       1 ],

        #----------now the empty XML tags
        [ qr/\btoday\b/,    \&fancy_today        ],
        [ qr/\bp\b/,        \&par                ],
        [ qr/\bhr\b/,       \&hrule              ],
    );

    foreach my $entry (@dispatch) {
        my ($pattern, $emit, $wants_attrs) = @$entry;
        next unless $el =~ $pattern;
        return $wants_attrs ? $emit->($p, $el, \@attrlist)
                            : $emit->($p, $el);
    }
    return;
}

# Registered as the parser's 'Char' handler: print the character data
# (second argument) after escaping it with tex_escaped.  The parser
# object in the first argument is not used.
sub charhandler {
    my $data = $_[1];
    my $tex  = tex_escaped($data);
    print $tex;
}

# Escape a piece of document text so that TeX typesets it literally.
#
#   $text   - raw text (character data or an attribute value)
#   returns - the escaped copy; the caller's value is not modified
#
# Straight double quotes are replaced alternately by `` and '', the first
# quote of each call being an opening one.  All TeX special characters
# are then replaced in a single pass, so replacement text is never
# rescanned: the backslash or dollar sign inserted for one character
# cannot be escaped a second time, and no placeholder string is needed
# that could collide with real input.
sub tex_escaped {
    my $text = shift;

    # Replacement TeX for every special character.  Kept inside the sub so
    # it is populated no matter when the sub is first called.
    my %tex_for = (
        '\\' => '$\backslash$',
        '$'  => '\$',
        '{'  => '$\lbrace$',
        '}'  => '$\rbrace$',
        '&'  => '\&',
        '^'  => q{\char'136\relax },
        '~'  => q{\char'176\relax },
        '|'  => '$|$',
        '_'  => '\_',
        '#'  => q{\char'043\relax },
        '%'  => '\%',
        '<'  => '$<$',
        '>'  => '$>$',
    );

    #----------handle quotes: even-numbered hits open, odd-numbered close
    my $quotes_seen = 0;
    $text =~ s/"/ $quotes_seen++ % 2 ? q{''} : q{``} /ge;

    #----------escape special chars, all in one pass
    $text =~ s/([\\\$\{\}&^~|_\#%<>])/$tex_for{$1}/g;

    return $text;
}

# Registered as the parser's 'End' handler: emit the closing TeX for a
# recognised element.
#
#   $p  - the parser object, handed on to the emitter
#   $el - the element name
#
# Element names that match none of the patterns produce no output.
sub endhandler {
    my ($p, $el) = @_;

    # Tried top to bottom; the first matching pattern wins.
    my @dispatch = (
        [ qr/\bhead\b/,     \&endhead     ],
        [ qr/\btitle\b/,    \&endtitle    ],
        [ qr/\bsubtitle\b/, \&endsubtitle ],
        [ qr/\bit\b/,       \&enditalic   ],
        [ qr/\bbf\b/,       \&endbold     ],
        [ qr/\bsect\b/,     \&endsect     ],
        [ qr/\bcode\b/,     \&endcode     ],
        [ qr/\bitemize\b/,  \&enditemize  ],
        [ qr/\bitem\b/,     \&enditem     ],
    );

    foreach my $entry (@dispatch) {
        my ($pattern, $emit) = @$entry;
        return $emit->($p, $el) if $el =~ $pattern;
    }
    return;
}
# Registered as the parser's 'Final' handler: end the TeX run by printing
# \bye on a line of its own.
sub finalhandler {
    print "\\bye\n";
}

# <title> start: emit \hfil and open a group set in the \fifteenssbx font.
sub starttitle {
    print '\hfil{\fifteenssbx ';
}

# <title> end: close the font group and end the paragraph.
sub endtitle {
    print '}\par';
}

# <subtitle> start: emit \hfil and open a group set in the \twelvevtt font.
sub startsubtitle {
    print '\hfil{\twelvevtt ';
}

# <subtitle> end: close the font group and end the paragraph.
sub endsubtitle {
    print '}\par';
}

# <it> start: open a group that selects the \tenit font.
sub startitalic {
    print '{\tenit ';
}

# <it> end: emit an italic correction (\/) and close the group.
sub enditalic {
    print '\/}';
}

# <bf> start: open a group that selects the \tenbx font.
sub startbold {
    print '{\tenbx ';
}

# <bf> end: emit \/ and close the group (same closing text as <it>).
sub endbold {
    print '\/}';
}

# <sect> start: emit \bigbreak on its own line, then open a group set in
# the \twelvessbx font for the section heading.
sub startsect {
    my $heading_open = "\n" . '\bigbreak' . "\n" . '{\twelvessbx ';
    print $heading_open;
}

# <sect> end: close the heading group and end the paragraph.
sub endsect {
    my $heading_close = '}\par' . "\n";
    print $heading_close;
}

# <itemize> start: set \tmphsize to the current \hsize less \itemskip.
# The emitted line carries a trailing "% itemize" TeX comment.
sub startitemize {
    my $tex = "\n"
            . '\tmphsize=\hsize\advance\tmphsize by -\itemskip'
            . "\t" . '% itemize' . "\n";
    print $tex;
}

# <itemize> end: add \itemskip back onto \tmphsize.
# The emitted line carries a trailing "% end itemize" TeX comment.
sub enditemize {
    my $tex = '\advance\tmphsize by \itemskip'
            . "\t" . '% end itemize' . "\n";
    print $tex;
}

# <item> start: open one list entry.  Emits \medskip, a shift of
# \itemskip and a \vbox that is \tmphsize wide, beginning with the item
# label in the \tenbx font.  The \vbox is left open; enditem prints the
# closing brace.
#
#   $_[2] - reference to the element's flat (name, value, ...) attribute
#           list; may be undefined or empty
#
# Only the "name" attribute is used: its value, passed through
# tex_escaped, becomes the label.  Without it the label is empty.
# The caller's attribute list is only read, never consumed.
sub startitem {
    my $attrsR   = $_[2];
    my @attrs    = defined $attrsR ? @$attrsR : ();
    my $itemname = "";

    # Walk the name/value pairs by index; a trailing name that has no
    # value is ignored.
    for (my $i = 0; $i < $#attrs; $i += 2) {
        my ($attr, $val) = @attrs[ $i, $i + 1 ];
        next unless defined $val and $attr eq "name";
        $itemname = tex_escaped($val);
    }

    print "\\medskip\\hskip\\itemskip\\vbox{\\hsize\\tmphsize\%\n";
    print "    {\\tenbx $itemname}\\quad\n";
}
# <item> end: print the closing brace for the \vbox that startitem opened.
sub enditem {
    print '}', "\n";
}

# <code> start: open a TeX group in the \tenvtt font, set \tmphsize to
# \hsize less 4em, skip 2em, and open a \vbox of that width with
# \obeylines and \obeywhitespace in effect.  endcode closes the box and
# the group.
sub startcode {
    my @lines = (
        '\begingroup\tenvtt',
        '\tmphsize=\hsize',
        '\advance\tmphsize by -4em',
        '\null\hskip 2em',
        '\obeylines\obeywhitespace\vbox{\hsize\tmphsize',
    );
    print join('', map { "$_\n" } @lines);
}
# <code> end: close the \vbox and the group that startcode opened, then
# leave one blank line in the output.
sub endcode {
    my @lines = ( '}', '\endgroup', '' );
    print join('', map { "$_\n" } @lines);
}

# <head> end: draw a 1pt rule 5in wide under the header, set \hsize to
# 6in, switch to the \tenrm font and add a \bigskip before the body.
sub endhead {
    my @lines = (
        '\par\vrule height 1pt depth 0pt width 5in\par',
        '\hsize 6in' . "\t\t\t" . '% narrow the right margin',
        '\tenrm',
        '\bigskip',
        '',                     # blank line after the header block
    );
    print join('', map { "$_\n" } @lines);
}
# <today/>: print the date as "D Mon, YYYY", e.g. "21 May, 1999".
#
#   $_[2] - optional epoch time to format instead of the current time
#           (the first two arguments, parser and element name, are unused)
#
# The clock is read once and every field comes from that one snapshot;
# separate reads could straddle midnight and mix fields of two dates.
sub fancy_today {
    my $when   = $_[2];
    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # Time::localtime's object interface: ->mday, ->mon (0-based), ->year
    my $now = defined $when ? localtime($when) : localtime();

    print $now->mday . " " . $months[$now->mon] . ", "
      . ($now->year + 1900);
}
# <p/>: start a new paragraph by printing \par followed by a space.
sub par {
    print '\par ';
}
# <hr/>: print a \bigbreak, a 1ex vertical skip and a horizontal rule,
# followed by a newline.
sub hrule {
    my $rule = '\bigbreak\null\vskip 1ex\hrule';
    print $rule . "\n";
}

__END__

=head1 NAME

xml2tex - Converts XML documentation into TeX code, suitable for processing with TeX or pdfTeX.

=head1 SYNOPSIS

    kali$ xml2tex mydoc.xml > mydoc.tex
    kali$ pdftex mydoc.tex && acroread mydoc.pdf
    kali$ tex mydoc.tex && dvips -o mydoc.ps mydoc

=head1 ABSTRACT

This program understands a small but useful number of XML tags.  It can generate TeX formatting code based on the structural tags.  This TeX code may be processed by pdfTeX to produce pdf-format files or by TeX to produce dvi (and postscript) files.

=head2 NOTE

You must supply a starting and ending tag around your whole document after the xml version tag:

    <?xml version="1.0"?>
    <doc>
    Hi there!
    </doc>

XML is case sensitive.

=head1 SUPPORTED TAGS

=over 4

=item HEAD

The <head></head> tag encloses a header for the document: currently only a title and subtitles.

=item TITLE

The <title></title> tag encloses the document's title.

=item SUBTITLE

<subtitle></subtitle>

=item ITALICS

<it></it>

=item BOLD FACE

<bf></bf>

=item PREFORMATTED CODE

<code></code>

The code tag generates TeX code that will produce a monospaced font and will respect the formatting you provide with whitespace--but NOT FOR TABS!  Tabs don't work.  They look like spaces in the output.  For example,

    <code>
    # this is a comment
    sub foo {
    	my $bar = "open";
    }
    </code>

=back

=head1 more supported tags: EMPTY-ELEMENT TAGS

When the tag doesn't enclose something it's called an empty-element tag.  You have to put a slash I<after> the tag name, like this:

    It is <today/>.

=over 4

=item PARAGRAPH

<p/>

Shows that this is the beginning of a new paragraph.

=item TODAY

<today/>

Inserts a string for today's date, e.g., "21 May, 1999".

=back

=head1 BUGS

It's not really a bug, but a lot of the kinds of things you'd want in documentation turn out to be enough like XML code to mess up the parser.  Things that look like "&nbsp;", for example.  You can use XML's CDATA directive to quote weird stuff.  See http://www.w3.org/XML/

I'm sure there are bugs.  Please let me know about them.
