# Sgml.pm: Class Used for Objects to Manage SGML Data

package RDA::Driver::Sgml;

# $Id: Sgml.pm,v 1.19 2015/07/31 12:57:28 RDA Exp $
# ARCS: $Header: /home/cvs/cvs/RDA_8/src/scripting/lib/RDA/Driver/Sgml.pm,v 1.19 2015/07/31 12:57:28 RDA Exp $
#
# Change History
# 20150731  MSC  Extend the character encoding.

=head1 NAME

RDA::Driver::Sgml - Class Used for Objects to Manage SGML Data

=head1 SYNOPSIS

 require RDA::Driver::Sgml;

=head1 DESCRIPTION

The objects of the C<RDA::Driver::Sgml> class are used to manage SGML data. It
regroups the methods common to the C<RDA::Object::Html> and C<RDA::Object::Xml>
classes.

The following methods are available:

=cut

use strict;

BEGIN
{ use Exporter;
  use IO::File;
  use RDA::Text qw(debug get_string);
  use RDA::Object::View;
}

# Define the global public variables
use vars qw($STRINGS $VERSION @EXPORT_OK @ISA);
$VERSION   = sprintf('%d.%02d', q$Revision: 1.19 $ =~ /(\d+)\.(\d+)/);
@EXPORT_OK = qw(convert decode encode);
@ISA       = qw(Exporter);

# Define the global private constants

# Define the global private variables

# Report the package version
sub Version
{ return $VERSION;
}

# Define common entities
## no critic (Zero)
my %tb_ent = (
  # Normal characters that have a special meaning in SGML context
  'quot'  => 042, # double quote
  'amp'   => 046, # ampersand
  'apos'  => 047, # single quote
  'lt'    => 074, # less than
  'gt'    => 076, # greater than

  # ISO 8859-1 characters
  'nbsp'   => 0240, # non breaking space
  'iexcl'  => 0241, # inverted exclamation point
  'cent'   => 0242, # Cent sign
  'pound'  => 0243, # Pound sign
  'curren' => 0244, # currency sign
  'yen'    => 0245, # Yen sign
  'brvbar' => 0246,
  'sect'   => 0247, # section sign
  'uml'    => 0250,
  'copy'   => 0251, # copyright sign
  'ordf'   => 0252,
  'laquo'  => 0253,
  'not'    => 0254, # not sign
  'shy'    => 0255,
  'reg'    => 0256, # registered sign
  'macr'   => 0257, # macron sign
  'deg'    => 0260, # degre sign
  'plusmn' => 0261, # plus/minus sign
  'sup2'   => 0262, # superscript 2
  'sup3'   => 0263, # superscript 3
  'acute'  => 0264, # acute sign
  'micro'  => 0265,
  'para'   => 0266, # paragraph sign
  'middot' => 0267, # mid dot
  'cedil'  => 0270, # cedilla sign
  'sup1'   => 0271, # superscript 1
  'ordm'   => 0272,
  'raquo'  => 0273,
  'frac14' => 0274, # fraction 1/4
  'frac12' => 0275, # fraction 1/2
  'frac34' => 0276, # fraction 3/4
  'iquest' => 0277, # inverted question mark
  'Agrave' => 0300, # uppercase A, grave accent
  'Aacute' => 0301, # uppercase A, acute accent
  'Acirc'  => 0302, # uppercase A, circumflex accent
  'Atilde' => 0303, # uppercase A, tilde
  'Auml'   => 0304, # uppercase A, dieresis or umlaut mark
  'Aring'  => 0305, # uppercase A, ring
  'AElig'  => 0306, # uppercase AE diphthong (ligature)
  'Ccedil' => 0307, # uppercase C, cedilla
  'Egrave' => 0310, # uppercase E, grave accent
  'Eacute' => 0311, # uppercase E, acute accent
  'Ecirc'  => 0312, # uppercase E, circumflex accent
  'Euml'   => 0313, # uppercase E, dieresis or umlaut mark
  'Igrave' => 0314, # uppercase I, grave accent
  'Iacute' => 0315, # uppercase I, acute accent
  'Icirc'  => 0316, # uppercase I, circumflex accent
  'Iuml'   => 0317, # uppercase I, dieresis or umlaut mark
  'ETH'    => 0320, # uppercase Eth, Icelandic
  'Ntilde' => 0321, # uppercase N, tilde
  'Ograve' => 0322, # uppercase O, grave accent
  'Oacute' => 0323, # uppercase O, acute accent
  'Ocirc'  => 0324, # uppercase O, circumflex accent
  'Otilde' => 0325, # uppercase O, tilde
  'Ouml'   => 0326, # uppercase O, dieresis or umlaut mark
  'times'  => 0327, # times sign
  'Oslash' => 0330, # uppercase O, slash
  'Ugrave' => 0331, # uppercase U, grave accent
  'Uacute' => 0332, # uppercase U, acute accent
  'Ucirc'  => 0333, # uppercase U, circumflex accent
  'Uuml'   => 0334, # uppercase U, dieresis or umlaut mark
  'Yacute' => 0335, # uppercase Y, acute accent
  'THORN'  => 0336, # uppercase THORN, Icelandic
  'szlig'  => 0337, # lowercase sharp s, German (sz ligature)
  'agrave' => 0340, # lowercase a, grave accent
  'aacute' => 0341, # lowercase a, acute accent
  'acirc'  => 0342, # lowercase a, circumflex accent
  'atilde' => 0343, # lowercase a, tilde
  'auml'   => 0344, # lowercase a, dieresis or umlaut mark
  'aring'  => 0345, # lowercase a, ring
  'aelig'  => 0346, # lowercase ae diphthong (ligature)
  'ccedil' => 0347, # lowercase c, cedilla
  'egrave' => 0350, # lowercase e, grave accent
  'eacute' => 0351, # lowercase e, acute accent
  'ecirc'  => 0352, # lowercase e, circumflex accent
  'euml'   => 0353, # lowercase e, dieresis or umlaut mark
  'igrave' => 0354, # lowercase i, grave accent
  'iacute' => 0355, # lowercase i, acute accent
  'icirc'  => 0356, # lowercase i, circumflex accent
  'iuml'   => 0357, # lowercase i, dieresis or umlaut mark
  'eth'    => 0360, # lowercase eth, Icelandic
  'ntilde' => 0361, # lowercase n, tilde
  'ograve' => 0362, # lowercase o, grave accent
  'oacute' => 0363, # lowercase o, acute accent
  'ocirc'  => 0364, # lowercase o, circumflex accent
  'otilde' => 0365, # lowercase o, tilde
  'ouml'   => 0366, # lowercase o, dieresis or umlaut mark
  'divide' => 0367, # divide sign
  'oslash' => 0370, # lowercase o, slash
  'ugrave' => 0371, # lowercase u, grave accent
  'uacute' => 0372, # lowercase u, acute accent
  'ucirc'  => 0373, # lowercase u, circumflex accent
  'uuml'   => 0374, # lowercase u, dieresis or umlaut mark
  'yacute' => 0375, # lowercase y, acute accent
  'thorn'  => 0376, # lowercase thorn, Icelandic
  'yuml'   => 0377, # lowercase y, dieresis or umlaut mark
);
## use critic

# Define the reverse mapping
my @tb_ent;
foreach my $val (0 .. 31)
{ $tb_ent[$val] = sprintf('&#x%X;', $val);
}
foreach my $val (32 .. 126)
{ $tb_ent[$val] = chr($val);
}
foreach my $val (127 .. 159)
{ $tb_ent[$val] = sprintf('&#x%X;', $val);
}
while (my ($key, $val) = each(%tb_ent))
{ $tb_ent[$val] = "&$key;";
}
foreach my $val (37, 39, 40, 41, 42, 91, 93, 96, 123, 124, 125)
{ $tb_ent[$val] = sprintf('&#x%X;', $val);
}

# Add additional entities

=head2 S<$h = RDA::Driver::Sgml-E<gt>new($type[,$prefix[,$level]])>

The object constructor.

C<RDA::Driver::Sgml> is represented by a blessed hash reference. The
following special keys are used:

=over 12

=item S<    B<'-bak'> > Item backup stack

=item S<    B<'-buf'> > Input buffer

=item S<    B<'-crf'> > Trailing carriage return indicator

=item S<    B<'-cur'> > Current parent object

=item S<    B<'-dat'> > Data information

=item S<    B<'-det'> > Child object array

=item S<    B<'-err'> > Error count

=item S<    B<'-flg'> > Item indicator

=item S<    B<'-flt'> > Type filter hash

=item S<    B<'-lst'> > Last item reference

=item S<    B<'-lvl'> > Trace level

=item S<    B<'-nam'> > Item name

=item S<    B<'-pre'> > Trace prefix

=item S<    B<'-sta'> > Exit status of the last command

=item S<    B<'-stk'> > Item stack

=item S<    B<'-txt'> > Text normalization indicator

=item S<    B<'-typ'> > Item type

=back

Possible item types are as follows:

=for stopwords CData

=over 12

=item S<    B<'B' > > Conditional section           (XML)

=item S<    B<'C' > > CData                         (XML)

=item S<    B<'D' > > Declaration             (HTML, XML)

=item S<    B<'E' > > Entity                        (XML)

=item S<    B<'H' > > HTML root item          (HTML)

=item S<    B<'P' > > Processing instruction        (XML)

=item S<    B<'R' > > Remark, Comment         (HTML, XML)

=item S<    B<'S' > > String, Text fragment   (HTML, XML)

=item S<    B<'T' > > Element tag             (HTML, XML)

=item S<    B<'X' > > XML root item                 (XML)

=back

=cut

sub new
{ my ($cls, $typ, $pre, $dbg) = @_;

  # Create the object
  my $slf = bless {
    -buf => q{},
    -dat => q{},
    -err => 0,
    -flt => {},
    -lvl => 0,
    -pre => $pre || 'TRC> ',
    -stk => [],
    -txt => 1,
    -typ => $typ,
    }, $cls;

  # Perform extra initialization
  $slf->set_trace($dbg);
  $slf->{'-cur'} = $slf->{'-lst'} = $slf;

  # Return the object reference
  return $slf;
}

=head2 S<$h-E<gt>disable($flt)>

This method indicates the list of child types to ignore. When the list is
empty, it disables any type filtering. It returns the parser object reference.

=cut

sub disable
{ my ($slf, $flt) = @_;

  # Update the type filter list
  $slf->{'-flt'} = {};
  if ($flt)
  { for (split(/ */, $flt))
    {$slf->{'-flt'}->{uc($_)} = 1;
    }
  }

  # Return the object reference
  return $slf;
}

=head1 PARSING METHODS

=head2 S<$h-E<gt>add_item($typ[,...])>

This method adds an item to the item tree.

=cut

sub add_item
{ my ($top, $typ, @arg) = @_;
  my ($key, $slf, $val);

  # Check if the type must be ignored
  return if exists($top->{'-flt'}->{$typ});

  # Insert pending text
  $top->add_text;

  # Create the object
  debug($top->{'-pre'}.qq{** Add item $typ(}.join(q{,}, @arg).q{)})
    if $top->{'-lvl'};
  $slf = bless { -typ => $typ, -txt => $top->{'-cur'}->{'-txt'} }, ref($top);

  # Add the initial attributes
  while (($key, $val) = splice(@arg, 0, 2))
  { $val = $key unless defined($val);
    $slf->{$key} = $val;
  }

  # Link elements
  push(@{$top->{'-cur'}->{'-det'}}, $slf);
  $top->{'-lst'} = $slf;

  # Return the object reference
  return $slf;
}

=head2 S<$h-E<gt>add_text($text)>

This method adds a text item to the item tree.

=cut

sub add_text
{ my ($slf, $txt) = @_;

  if (defined($txt))
  { $slf->{'-dat'} .= $txt;
    debug($slf->{'-pre'}.q{++ Add text fragment }._fmt_str($txt))
      if $slf->{'-lvl'};
  }
  elsif (length($txt = $slf->{'-dat'}))
  { # Normalize the string
    if ($slf->{'-cur'}->{'-txt'} > 0)
    { $txt =~ s/\n/ /g;
      $txt =~ s/^\s+//g;
      $txt =~ s/\s+/ /g;
    }
    elsif ($slf->{'-cur'}->{'-txt'} < 0)
    { $txt =~ s/\s+$//;
      $txt =~ s/\n+$//;
      $txt =~ s/^\s*\n+$//;
    }

    # Add a text element
    $slf->{'-dat'} = q{};
    $slf->add_item('S', -dat => $slf->decode($txt)) if length($txt);
  }
  return;
}

=head2 S<$h-E<gt>debug_buffer($msg)>

This method displays a debug message and the first characters from the parser
buffer.

=cut

sub debug_buffer
{ my ($slf, $msg) = @_;

  debug("$msg: "._fmt_str(substr($slf->{'-buf'}, 0, 32).q{...}));
  return;
}

=head2 S<$h-E<gt>fmt_str($str)>

This method formats a string to be included in debug messages or dumps.

=cut

sub fmt_str
{ my ($slf, $str) = @_;

  return _fmt_str($str);
}

sub _fmt_str
{ my ($str) = @_;

  $str =~ s{\\}{\\\\}g;
  $str =~ s{'}{\\'}g;
  $str =~ s{\f}{\\f}g;
  $str =~ s{\n}{\\n}g;
  $str =~ s{\r}{\\r}g;
  $str =~ s{\t}{\\t}g;
  return q{'}.$str.q{'};
}

=head2 S<$h-E<gt>pop_item>

This method pops an item from the parsing stack.

=cut

sub pop_item
{ my ($slf) = @_;

  return $slf->{'-cur'} = $slf->{'-lst'} = pop(@{$slf->{'-stk'}}) || $slf;
}

=head2 S<$h-E<gt>push_item>

This method pushes an item in the parsing stack.

=cut

sub push_item
{ my ($slf) = @_;

  push(@{$slf->{'-stk'}}, $slf->{'-cur'});
  return $slf->{'-cur'} = $slf->{'-lst'};
}

=head2 S<$h-E<gt>restore_stack>

This method restores the parsing stack backup.

=cut

sub restore_stack
{ my ($slf) = @_;

  if (exists($slf->{'-bak'}))
  { $slf->{'-stk'} = $slf->{'-bak'};
    $slf->{'-lst'} = pop(@{$slf->{'-stk'}});
    $slf->{'-cur'} = pop(@{$slf->{'-stk'}});
  }
  return;
}

=head2 S<$h-E<gt>save_stack>

This method takes a backup of the parsing stack.

=cut

sub save_stack
{ my ($slf) = @_;

  return $slf->{'-bak'} = [ @{$slf->{'-stk'}}, $slf->{'-cur'}, $slf->{'-lst'} ];
}

=head1 TREE METHODS

=head2 S<$h-E<gt>as_string([$flg])>

This method returns the object tree as a string. When the flag is set, the
conditional sections appear as though other nodes and conditions are not
resolved.

=cut

sub as_string
{ my $slf = shift;
  my $flg = shift;
  my $lvl = shift || 0;
  my $buf = q{};

  $slf->_traverse(0, $flg, \&_as_string, \$buf);
  return $buf;
}

sub _as_string
{ my ($slf, $lvl, $flg, $buf) = @_;

  $slf->_dump_item($buf, q{}, $lvl) if $flg;
  return 1;
}

=head2 S<$h-E<gt>dump([$level[,$text[,$trace]]])>

This method dumps the item tree. It returns a reference to the parser object.

=cut

sub dump  ## no critic (Builtin)
{ my ($slf, $lvl, $pre, $trc) = @_;
  my ($buf);

  $buf = q{};
  $lvl = 0           unless defined($lvl);
  $pre = q{  } x $lvl unless defined($pre);
  if ($trc)
  { $slf->_dump_item(\$buf, $pre, $lvl);
  }
  else
  { $slf->_traverse(0, 1, \&_dump, \$buf, $pre);
  }
  return $buf;
}

sub _dump
{ my ($slf, $lvl, $flg, $buf, $pre) = @_;

  $slf->_dump_item($buf, $pre, $lvl) if $flg;
  return 1;
}

sub _dump_item
{ my ($slf, $buf, $pre, $lvl) = @_;

  $$buf .= qq{\n} if $$buf;
  $$buf .= $pre;
  $$buf .= q{..} x $lvl;
  $$buf .= q{[}.$slf->{'-typ'}.q{]};
  $$buf .= q{ }.$slf->{'-nam'} if exists($slf->{'-nam'});
  $$buf .= q{ }._fmt_str($slf->{'-dat'})
    if index('HX', $slf->{'-typ'}) < 0 && exists($slf->{'-dat'});
  for (sort keys(%{$slf}))
  { $$buf .= " $_="._fmt_str($slf->{$_}) unless m/^-/;
  }
  return;
}

=head2 S<$h-E<gt>exists($attr)>

This function indicates whether the attribute exists in the specified node.

=cut

sub exists  ## no critic (Builtin)
{ my ($slf, $att) = @_;

  return exists($slf->{$att});
}

=head2 S<$h-E<gt>find($qry)>

This method finds nodes corresponding to the specified criteria. It returns a
list of XML object references. The criteria can be composed of the following:

=over 2

=item *

C<.../> or C</.../>, which indicate that the following tag must be searched
recursively in all child nodes.

=item *

C</>, which indicates that the following tag should only be searched in the
child node.

=item *

C<|.../>, which indicates that the following tag must be searched recursively
in all child nodes. Moreover, the following criteria are used only to reduce
the current result set.

=item *

C<|>, which indicates that the following tag should only be searched in the
child node. Moreover, the following criteria are used only to reduce the
current result set.

=item *

The tag to find. It supports a regular expression when placed between single or
double quotation marks.

=item *

An optional list of attributes that the element must match also. They are
specified as a list of attribute specifications, each composed of the attribute
name, an C<=> sign, and a regular expression placed between single or double
quotation marks.

=item *

An optional condition on the associated data must match. It is composed of
C<*=> followed by the regular expression placed between simple or double
quotation marks.

=item *

An optional position constraint to select a single element from that hit
list. It is specified as an offset placed between square brackets. The first
list element corresponds to a zero offset. A number less than zero indicates
an offset from the end of the list.

=back

For example,

  .../tag1/tag2 attr1='re1' attr2="re2" [-1]|.../tag3 *="re3"

=cut

sub find
{ my ($slf, $qry) = @_;
  my ($cur, $off, $tbl, @prv, @qry, @sel, @tbl);

  # Decode the query string
  $tbl = \@qry;
  while (length($qry))
  { if ($qry =~ s{^\/*\.\.\.\/+}{})
    { push(@{$tbl}, $cur = [ 1, \&_find_skip, q{?}, {} ]);
    }
    elsif ($qry =~ s{^\|+\/*\.\.\.\/+}{})
    { $tbl = \@sel;
      push(@{$tbl}, $cur = [ 1, \&_find_skip, q{?}, {} ]);
    }
    elsif ($qry =~ s{^\/+}{})
    { push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]);
    }
    elsif ($qry =~ s{^\|+\/*}{})
    { $tbl = \@sel;
      push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]);
    }
    elsif ($qry =~ s{^([A-Za-z:_][A-Za-z0-9:_\.\-]*)=([\042\047])(.*?)\2\s*}{})
    { push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]) unless $cur;
      $cur->[3]->{$1} = RDA::Object::View->is_match($3, 1);
    }
    elsif ($qry =~ s{^([A-Za-z:_][A-Za-z0-9:_\.\-]*)\s*}{})
    { push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]) unless $cur;
      $cur->[1] = \&_find_eq;
      $cur->[2] = $1;
    }
    elsif ($qry =~ s{^([\042\047])(.*?)\1\s*}{})
    { push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]) unless $cur;
      $cur->[1] = \&_find_match;
      $cur->[2] = RDA::Object::View->is_match($2, 1);
    }
    elsif ($qry =~ s{^\*=([\042\047])(.*?)\1\s*}{})
    { push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]) unless $cur;
      $cur->[4] = RDA::Object::View->is_match($2, 1);
    }
    elsif ($qry =~ s{^\[(\-?\d+)\]\s*}{})
    { push(@{$tbl}, $cur = [ 0, \&_find_skip, q{?}, {} ]) unless $cur;
      $cur->[5] = $1;
    }
    else
    { return ();
    }
  }

  # Apply the search criteria
  @tbl = _search(\@qry, $slf);

  # Apply the restrictions
  @tbl = grep {scalar _search(\@sel, $_)} @tbl if @sel;

  # Return the objects found
  return @tbl;
}

sub _find
{ my ($slf, $lvl, $flg, $tbl, $cur) = @_;

  # Skip root item and scan next element at same level
  return 1 unless $flg && $lvl;

  # Analyse the node
  return $cur->[0] unless &{$cur->[1]}($slf, $cur->[2]);
  foreach my $key (keys(%{$cur->[3]}))
  { return $cur->[0]
      unless exists($slf->{$key}) && $slf->{$key} =~ $cur->[3]->{$key};
  }
  push(@{$tbl}, $slf)
    unless defined($cur->[4])  ## no critic (Unless)
        && $slf->get_data !~ $cur->[4];
  return $cur->[0];
}

sub _find_eq
{ my ($slf, $nam) = @_;

  return $slf->{'-typ'} eq 'T'
    && exists($slf->{'-nam'}) && $slf->{'-nam'} eq $nam;
}

sub _find_match
{ my ($slf, $pat) = @_;

  return $slf->{'-typ'} eq 'T'
    && exists($slf->{'-nam'}) && $slf->{'-nam'} =~ $pat;
}

sub _find_skip
{ return 0;
}

sub _search
{ my ($tbl, @tbl) = @_;
  my ($off, @prv);

  foreach my $cur (@{$tbl})
  { @prv = @tbl;
    @tbl = ();
    foreach my $obj (@prv)
    { $obj->traverse(\&_find, \@tbl, $cur);
    }
    if (defined($off = $cur->[5]))
    { $off = $#tbl + $off + 1 if $off < 0;
      @tbl = ($off < 0 || $off > $#tbl) ? () : ($tbl[$off]);
    }
  }
  return @tbl;
}

=head2 S<$h-E<gt>get_attr>

This function returns the list of node attributes.

=cut

sub get_attr
{ my ($slf) = @_;
  my @tbl;

  @tbl = grep {m/^[^-]/} keys(%{$slf});
  return (sort @tbl);
}

=head2 S<$h-E<gt>get_content([$flt[,cln]])>

This method returns the list of child nodes after resolving the conditions. The
second argument specifies the list of child types to consider. The third
argument specifies a regular expression to identify objects that must be
replaced by their content. By default, it returns all child nodes.

=cut

sub get_content
{ my ($slf, $flt, $cln) = @_;
  my @tbl;

  if (exists($slf->{'-det'}))
  { for (@{$slf->{'-det'}})
    { if ($_->{'-typ'} eq 'B')
      { push (@tbl, $_->get_content($flt, $cln)) if $_->{'-flg'};
      }
      elsif ($cln && exists($_->{'-nam'}) && $_->{'-nam'} =~ $cln)
      { push (@tbl, $_->get_content($flt, $cln));
      }
      elsif ($flt)
      { push(@tbl, $_)
          unless index($flt, $_->{'-typ'}) < 0;  ## no critic (Unless)
      }
      else
      { push(@tbl, $_);
      }
    }
  }
  return @tbl;
}

=head2 S<$h-E<gt>get_error>

This function returns the number of parsing errors.

=cut

sub get_error
{ return shift->{'-err'};
}

=head2 S<$h-E<gt>get_name([$dft])>

This method returns the object name or the default value when no name is
defined.

=cut

sub get_name
{ my ($slf, $dft) = @_;

  return exists($slf->{'-nam'}) ? $slf->{'-nam'} : $dft;
}

=head2 S<$h-E<gt>get_status>

This method returns the exit code of the last command parsed.

=cut

sub get_status
{ return shift->{'-sta'};
}

=head2 S<$h-E<gt>get_type>

This function returns the node type.

=cut

sub get_type
{ return shift->{'-typ'};
}

=head2 S<$h-E<gt>get_value($attr[,$dft])>

This function returns the value of the attribute in the specified node. When
the attribute is not defined, it returns the default value.

=cut

sub get_value
{ my ($slf, $att, $dft) = @_;

  return exists($slf->{$att}) ? $slf->{$att} : $dft;
}

=head2 S<$h-E<gt>map_content([$fct[,$arg...]])>

This method reconstructs the list of child nodes by applying the callback
function on each child node. It passes the node reference and the specified
arguments to the callback function.

It returns the object reference.

=cut

sub map_content
{ my ($slf, $fct, @arg) = @_;

  if (exists($slf->{'-det'}))
  { $slf->{'-det'} = [map {&{$fct}($_, @arg)} @{$slf->{'-det'}}];
  }
  return $slf;
}

=head2 S<$h-E<gt>parse_command($cmd)>

This method parses text directly from the output of a command.
It returns a reference to the SGML object.

=cut

sub parse_command
{ my ($slf, $cmd) = @_;
  my ($buf);

  $slf->{'-sta'} = q{};
  if (defined($cmd) && open(FIL, "$cmd |"))  ## no critic (Open,Handle)
  { while (sysread(FIL, $buf, 512))
    { $slf->parse($buf);
    }
    $slf->eof;
    close(FIL);
    $slf->{'-sta'} = $?;
  }
  return $slf;
}

=head2 S<$h-E<gt>parse_file($file)>

This method parses text directly from a file. It returns a reference to the
XML object.

=cut

sub parse_file
{ my ($slf, $fil) = @_;
  my ($buf, $ifh);

  if (defined($fil))
  { if (ref($fil) eq 'RDA::Object::Buffer')
    { $ifh = $fil->get_handle;
      $slf->parse($buf, $ifh) while $ifh->read($buf, 512);
      $slf->eof;
    }
    else
    { $ifh = IO::File->new;
      if ($ifh->open("<$fil"))
      { $slf->parse($buf, $ifh) while $ifh->read($buf, 512);
        $slf->eof;
        $ifh->close;
      }
    }
  }
  return $slf;
}

=head2 S<$h-E<gt>set_trace([$level])>

This method specifies the trace level and returns the previous trace level.

=cut

sub set_trace
{ my ($slf, $lvl) = @_;
  my ($old);

  $old = $slf->{'-lvl'};
  $slf->{'-lvl'} = ($lvl > 0) ? $lvl : 0 if defined($lvl);
  return $old;
}

=head2 S<$h-E<gt>traverse($fct[,...])>

This method explores the whole object tree recursively. It executes the
callback function on each node before and after examining child nodes. It
passes the following arguments to the callback function:

=over 2

=item *

A reference to the current node

=item *

Its indentation level

=item *

An indicator (true at node start, false at node end)

=item *

The extra arguments of the traverse function

=back

The tree exploration continues as long as the callback function returns a true
value. It includes or ignores the conditional sections based on condition
values.

=cut

sub traverse
{ return shift->_traverse(0, 0, @_);
}

sub _traverse
{ my ($slf, $lvl, $flg, $fct, @arg) = @_;

  # Treat a node
  if ($flg || $slf->{'-typ'} ne 'B')
  { if (&$fct($slf, $lvl, 1, @arg) && exists($slf->{'-det'}))
    { for (@{$slf->{'-det'}})
      { last unless $_->_traverse($lvl + 1, $flg, $fct, @arg);
      }
    }
    return &$fct($slf, $lvl, 0, @arg);
  }

  # Treat a conditional section
  if ($slf->{'-flg'} && exists($slf->{'-det'}))
  { for (@{$slf->{'-det'}})
    { return 0 unless $_->_traverse($lvl, $flg, $fct, @arg);
    }
  }
  return 1;
}

=head1 ENTITY METHODS

=head2 S<$h-E<gt>convert($str)> or S<RDA::Driver::Sgml::convert($str)>

This method converts entities found in the string with their numeric
equivalence. It ignores unrecognized entities.

=cut

sub convert
{ my $str = shift;
  $str = shift if ref($str);

  $str =~ s/(&(\w+);?)/
    exists($tb_ent{$2}) ? sprintf('&#%d;', $tb_ent{$2}) : $1/eg;
  $str =~ s/(\&+)([^#]|\z)/('&#38;' x length($1)).$2/eg;
  return $str;
}

=head2 S<$h-E<gt>decode($str)> or S<RDA::Driver::Sgml::decode($str)>

This method replaces entities found in the string with the corresponding ISO
8859-1 character. It ignores unrecognized entities.

=cut

sub decode
{ my $str = shift;
  $str = shift if ref($str);
  my $cod;

  $str =~ s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
  $str =~ s/(&\#[xX]([0-9a-fA-F]+);?)/$cod = hex($2);
                                      $cod < 256 ? chr($cod) : $1/eg;
  $str =~ s/(&(\w+);?)/exists($tb_ent{$2}) ? chr($tb_ent{$2}) : $1/eg;
  return $str;
}

=head2 S<$h-E<gt>encode($str,$flg)> or S<RDA::Driver::Sgml::encode($str,$flg)>

This method replaces control characters, ISO 8859-1 or UTF-8 characters,
special SGML characters, and other characters presenting a security exposure
with their entity representation. Unless the flag is set, it encodes Wiki
(C<%>, C<'>, C<*>, C<[>, C<]>, C<^>, C<`>, C<{>, C<}>, and C<|>) characters
also.

=cut

sub encode
{ my $str = shift;
  $str = shift if ref($str);
  my $flg = shift;
  my ($chr, $ord);

  ## no critic (Interpolation)
  $chr = $flg
    ? '\040\041\043-\045\047\052\054\056-\072\075\077-\176'
    : '\040\041\043\044\054\056-\072\075\077-\132\134\137\141-\172\176';
  $str =~ s/([^\n\r\t$chr])/
    $ord = ord($1);
    $ord < 256 ? $tb_ent[$ord] : sprintf('&#x%X;', $ord)/ge;
  return $str;
}

1;

__END__

=head1 SEE ALSO

L<RDA::Object::Buffer|RDA::Object::Buffer>,
L<RDA::Object::Html|RDA::Object::Html>,
L<RDA::Object::Parser|RDA::Object::Parser>,
L<RDA::Object::Request|RDA::Object::Request>,
L<RDA::Object::Response|RDA::Object::Response>,
L<RDA::Object::UsrAgent|RDA::Object::UsrAgent>,
L<RDA::Object::View|RDA::Object::View>,
L<RDA::Object::Xml|RDA::Object::Xml>,
L<RDA::Text|RDA::Text>

=head1 COPYRIGHT NOTICE

Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved.

=head1 TRADEMARK NOTICE

Oracle and Java are registered trademarks of Oracle and/or its
affiliates. Other names may be trademarks of their respective owners.

=cut