afblue.pl

Branch :
Show log
Commit
Author : Werner Lemberg
Date : 2013-08-25 08:01:41
Hash : 8b8be783
Message : [autofit] Introduce data file for blue strings. The idea is to have a central file which gets processed by a Perl script to create proper `.c' and `.h' files using templates. There are two other reasons to do that: . The data file should be easily readable. We use UTF-8 encoding which then gets converted to single bytes. . Since the number of supported scripts will increase soon, the current usage of blue string arrays is a waste of space. Using the Perl script it is possible to imitate jagged arrays, defining enumeration constants as offsets into the arrays. This commit only adds files without changing any functionality. * src/autofit/afblue.dat: New data file. * src/tools/afblue.pl: New Perl script for processing `afblue.dat'. * src/autofit/afblue.cin, src/autofit/afblue.hin: New template files for... * src/autofit/afblue.c, src/autofit/afblue.c: New source files. To avoid a dependency on Perl, we add them too.
src/tools/afblue.pl
#! /usr/bin/perl -w
# -*- Perl -*-
#
# afblue.pl
#
# Process a blue zone character data file.
#
# Copyright 2013 by
# David Turner, Robert Wilhelm, and Werner Lemberg.
#
# This file is part of the FreeType project, and may only be used,
# modified, and distributed under the terms of the FreeType project
# license, LICENSE.TXT.  By continuing to use, modify, or distribute
# this file you indicate that you have read the license and
# understand and accept it fully.

use strict;
use warnings;
use English '-no_match_vars';
use open ':std', ':locale';


my $prog = $PROGRAM_NAME;
$prog =~ s| .* / ||x;      # Remove path.

die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0;


my $datafile = $ARGV[0];

my %diversions;        # The extracted and massaged data from `datafile'.
my @else_stack;        # Booleans to track else-clauses.
my @name_stack;        # Stack of integers used for names of aux. variables.

my $curr_enum;         # Name of the current enumeration.
my $curr_array;        # Name of the current array.
my $curr_max;          # Name of the current maximum value.

my $curr_enum_element; # Name of the current enumeration element.
my $curr_offset;       # The offset relative to current aux. variable.
my $curr_elem_size;    # The size of the current string or block.

my $have_sections = 0; # Boolean; set if start of a section has been seen.
my $have_strings;      # Boolean; set if current section contains strings.
my $have_blocks;       # Boolean; set if current section contains blocks.

my $have_enum_element; # Boolean; set if we have an enumeration element.
my $in_string;         # Boolean; set if a string has been parsed.

my $num_sections = 0;  # Number of sections seen so far.

my $last_aux;          # Name of last auxiliary variable.


# Regular expressions.

# [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n'
my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x;

# [<ws>] <enum_element_name> [<ws>] '\n'
my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x;

# '#' <preprocessor directive> '\n'
my $preprocessor_re = qr/ ^ \# /x;

# '/' '/' <comment> '\n'
my $comment_re = qr| ^ // |x;

# empty line
my $whitespace_only_re = qr/ ^ \s* $ /x;

# [<ws>] '"' <string> '"' [<ws>] '\n'  (<string> doesn't contain newlines)
my $string_re = qr/ ^ \s*
                       " ( (?: [^"\\]++ | \\. )*+ ) "
                       \s* $ /x;

# [<ws>] '{' <block> '}' [<ws>] '\n'  (<block> can contain newlines)
my $block_start_re = qr/ ^ \s* \{ /x;

# We need the capturing group for `split' to make it return the separator
# tokens (i.e., the opening and closing brace) also.
my $brace_re = qr/ ( [{}] ) /x;


sub Warn
{
  my $message = shift;
  warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n";
}


sub Die
{
  my $message = shift;
  die "$datafile:$INPUT_LINE_NUMBER: error: $message\n";
}


my $warned_before = 0;

sub warn_before
{
  Warn("data before first section gets ignored") unless $warned_before;
  $warned_before = 1;
}


sub strip_newline
{
  chomp;
  s/ \x0D $ //x;
}


sub end_curr_string
{
  # Append final null byte to string.
  if ($have_strings)
  {
    push @{$diversions{$curr_array}}, "    '\\0',\n" if $in_string;

    $curr_offset++;
    $in_string = 0;
  }
}


sub update_max_elem_size
{
  if ($curr_elem_size)
  {
    my $max = pop @{$diversions{$curr_max}};
    $max = $curr_elem_size if $curr_elem_size > $max;
    push @{$diversions{$curr_max}}, $max;
  }
}


sub convert_non_ascii_char
{
  # A UTF-8 character outside of the printable ASCII range, with possibly a
  # leading backslash character.
  my $s = shift;

  # Here we count characters, not bytes.
  $curr_elem_size += length $s;

  utf8::encode($s);
  $s = uc unpack 'H*', $s;

  $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg;

  return $s;
}


sub convert_ascii_chars
{
  # A series of ASCII characters in the printable range.
  my $s = shift;

  my $count = $s =~ s/\G(.)/'$1', /g;
  $curr_offset += $count;
  $curr_elem_size += $count;

  return $s;
}


sub convert_literal
{
  my $s = shift;
  my $orig = $s;

  # ASCII printables and space
  my $safe_re = '\x20-\x7E';
  # ASCII printables and space, no backslash
  my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E';

  $s =~ s{
           (?: \\? ( [^$safe_re] )
               | ( (?: [$safe_no_backslash_re]
                       | \\ [$safe_re] )+ ) )
         }
         {
           defined($1) ? convert_non_ascii_char($1)
                       : convert_ascii_chars($2)
         }egx;

   # We assume that `$orig' doesn't contain `*/'
   return $s . " /* $orig */";
}


sub aux_name
{
  return "af_blue_" . $num_sections. "_" . join('_', reverse @name_stack);
}


sub aux_name_next
{
  $name_stack[$#name_stack]++;
  my $name = aux_name();
  $name_stack[$#name_stack]--;

  return $name;
}


sub enum_val_string
{
  # Build string which holds code to save the current offset in an
  # enumeration element.
  my $aux = shift;

  my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" )
              ? ""
              : "$last_aux + ";

  return "    $aux = $add$curr_offset,\n";
}



# Process data file.

open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n";

while (<DATA>)
{
  strip_newline();

  next if /$comment_re/;
  next if /$whitespace_only_re/;

  if (/$section_re/)
  {
    Warn("previous section is empty") if ($have_sections
                                          && !$have_strings
                                          && !$have_blocks);

    end_curr_string();
    update_max_elem_size();

    # Save captured groups from `section_re'.
    $curr_enum = $1;
    $curr_array = $2;
    $curr_max = $3;

    $curr_enum_element = "";
    $curr_offset = 0;

    Warn("overwriting already defined enumeration \`$curr_enum'")
      if exists($diversions{$curr_enum});
    Warn("overwriting already defined array \`$curr_array'")
      if exists($diversions{$curr_array});
    Warn("overwriting already defined maximum value \`$curr_max'")
      if exists($diversions{$curr_max});

    $diversions{$curr_enum} = [];
    $diversions{$curr_array} = [];
    $diversions{$curr_max} = [];

    push @{$diversions{$curr_max}}, 0;

    @name_stack = ();
    push @name_stack, 0;

    $have_sections = 1;
    $have_strings = 0;
    $have_blocks = 0;

    $have_enum_element = 0;
    $in_string = 0;

    $num_sections++;
    $curr_elem_size = 0;

    $last_aux = aux_name();

    next;
  }

  if (/$preprocessor_re/)
  {
    if ($have_sections)
    {
      # Having preprocessor conditionals complicates the computation of
      # correct offset values.  We have to introduce auxiliary enumeration
      # elements with the name `af_blue_<s>_<n1>_<n2>_...' which store
      # offsets to be used in conditional clauses.  `<s>' is the number of
      # sections seen so far, `<n1>' is the number of `#if' and `#endif'
      # conditionals seen so far in the topmost level, `<n2>' the number of
      # `#if' and `#endif' conditionals seen so far one level deeper, etc.
      # As a consequence, uneven values are used within a clause, and even
      # values after a clause, since the C standard doesn't allow the
      # redefinition of an enumeration value.  For example, the name
      # `af_blue_5_1_6' is used to construct enumeration values in the fifth
      # section after the third (second-level) if-clause within the first
      # (top-level) if-clause.  After the first top-level clause has
      # finished, `af_blue_5_2' is used.  The current offset is then
      # relative to the value stored in the current auxiliary element.

      if (/ ^ \# \s* if /x)
      {
        push @else_stack, 0;

        $name_stack[$#name_stack]++;

        push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
        $last_aux = aux_name();

        push @name_stack, 0;

        $curr_offset = 0;
      }
      elsif (/ ^ \# \s* elif /x)
      {
        Die("unbalanced #elif") unless @else_stack;

        pop @name_stack;

        push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
        $last_aux = aux_name();

        push @name_stack, 0;

        $curr_offset = 0;
      }
      elsif (/ ^ \# \s* else /x)
      {
        my $prev_else = pop @else_stack;
        Die("unbalanced #else") unless defined($prev_else);
        Die("#else already seen") if $prev_else;
        push @else_stack, 1;

        pop @name_stack;

        push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
        $last_aux = aux_name();

        push @name_stack, 0;

        $curr_offset = 0;
      }
      elsif (/ ^ \# \s* endif /x)
      {
        my $prev_else = pop @else_stack;
        Die("unbalanced #endif") unless defined($prev_else);

        pop @name_stack;
        $name_stack[$#name_stack]++;

        # If there is no else-clause for an if-clause, we add one.  This is
        # necessary to have correct offsets.
        if (!$prev_else)
        {
          push @{$diversions{$curr_enum}}, enum_val_string(aux_name())
                                           . "#else\n";

          $curr_offset = 0;
        }

        push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
        $last_aux = aux_name();

        $curr_offset = 0;
      }

      # Handle (probably continued) preprocessor lines.
    CONTINUED_LOOP:
      {
        do
        {
          strip_newline();

          push @{$diversions{$curr_enum}}, $ARG . "\n";
          push @{$diversions{$curr_array}}, $ARG . "\n";

          last CONTINUED_LOOP unless / \\ $ /x;

        } while (<DATA>);
      }
    }
    else
    {
      warn_before();
    }

    next;
  }

  if (/$enum_element_re/)
  {
    end_curr_string();
    update_max_elem_size();

    $curr_enum_element = $1;
    $have_enum_element = 1;
    $curr_elem_size = 0;

    next;
  }

  if (/$string_re/)
  {
    if ($have_sections)
    {
      Die("strings and blocks can't be mixed in a section") if $have_blocks;

      # Save captured group from `string_re'.
      my $string = $1;

      if ($have_enum_element)
      {
        push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
        $have_enum_element = 0;
      }

      $string = convert_literal($string);

      push @{$diversions{$curr_array}}, "    $string\n";

      $have_strings = 1;
      $in_string = 1;
    }
    else
    {
      warn_before();
    }

    next;
  }

  if (/$block_start_re/)
  {
    if ($have_sections)
    {
      Die("strings and blocks can't be mixed in a section") if $have_strings;

      my $depth = 0;
      my $block = "";
      my $block_end = 0;

      # Count braces while getting the block.
    BRACE_LOOP:
      {
        do
        {
          strip_newline();

          foreach my $substring (split(/$brace_re/))
          {
            if ($block_end)
            {
              Die("invalid data after last matching closing brace")
                if $substring !~ /$whitespace_only_re/;
            }

            $block .= $substring;

            if ($substring eq '{')
            {
              $depth++;
            }
            elsif ($substring eq '}')
            {
              $depth--;

              $block_end = 1 if $depth == 0;
            }
          }

          # If we are here, we have run out of substrings, so get next line
          # or exit.
          last BRACE_LOOP if $block_end;

          $block .= "\n";

        } while (<DATA>);
      }

      if ($have_enum_element)
      {
        push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
        $have_enum_element = 0;
      }

      push @{$diversions{$curr_array}}, $block . ",\n";

      $curr_offset++;
      $curr_elem_size++;

      $have_blocks = 1;
    }
    else
    {
      warn_before();
    }

    next;
  }

  # Garbage.  We weren't able to parse the data.
  Die("syntax error");
}

# Finalize data.
end_curr_string();
update_max_elem_size();


# Filter stdin to stdout, replacing `@...@' templates.

sub emit_diversion
{
  my $diversion_name = shift;
  return (exists($diversions{$1})) ? "@{$diversions{$1}}"
                                   : "@" . $diversion_name . "@";
}


$LIST_SEPARATOR = '';

my $s1 = "This file has been generated by the Perl script \`$prog',";
my $s1len = length $s1;
my $s2 = "using data from file \`$datafile'.";
my $s2len = length $s2;
my $slen = ($s1len > $s2len) ? $s1len : $s2len;

print "/* " . $s1 . " " x ($slen - $s1len) . " */\n"
      . "/* " . $s2 . " " x ($slen - $s2len) . " */\n"
      . "\n";

while (<STDIN>)
{
  s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx;
  print;
}

# EOF
kc3-lang/freetype/src/tools/afblue.pl

Commit

kc3-lang/freetype /src/tools/afblue.pl