/* ------------------------------------------------------------------------
@NAME       : bibtex.g
@DESCRIPTION: PCCTS-based lexer and parser for BibTeX files.  (Or rather,
              for the BibTeX data description language.  This parser
              enforces nothing about the structure and contents of
              entries; that's up to higher-level processors.  Thus, there's
              nothing either particularly bibliographic or TeXish about
              the language accepted by this parser, except for the affinity
              for curly braces.)

              There are a few minor differences from the language accepted
              by BibTeX itself, but these are generally improvements over
              BibTeX's behaviour.  See the comments in the grammar, at least
              until I write a decent description of the language.

              I have used BibTool (yet another BibTeX parser, along with a
              prettyprinter and specialized language for a common set of
              bibhacks) as another check of correctness -- there are a few
              screwball things that BibTeX accepts and BibTool doesn't, so
              I felt justified in rejecting them as well.  In general, this
              parser is a little stricter than BibTeX, but a little looser
              than BibTool.  YMMV.

              The product of the parser is an abstract syntax tree that
              can be traversed to be printed in a simple form (see 
              print_entry() in bibparse.c) or perhaps transformed to
              a format more convenient for higher-level languages.

              Whole files may be parsed by entering the parser at `bibfile';
              in this case, the parser really returns a forest (list of
              ASTs, one per entry).  Alternately, you can enter the parser
              at `entry', which reads and parses a single entry.  Note that
              this has the unfortunate side effect of devouring the '@'
              character that starts the next entry, necessitating a hack
              along the lines of read_entry() (also in bibparse.c).
@GLOBALS    : the usual DLG and ANTLR cruft
@CALLS      : 
@CREATED    : first attempt: May 1996, Greg Ward
              second attempt (complete rewrite): July 25-28 1996, Greg Ward
@MODIFIED   : Sep 1996, GPW: changed to generate an AST rather than print
                             out each entry as it's encountered
              Jan 1997, GPW: redid the above, because it was lost when
                             my !%&$#!@ computer was stolen
@VERSION    : $Id: bibtex.g,v 1.8 1997/02/28 03:57:56 greg Exp $
-------------------------------------------------------------------------- */

/*
 * Code copied to the top of the generated C files.  The two configuration
 * macros are defined before any of the project headers are included.
 * NOTE(review): according to the PCCTS documentation, ZZCOL turns on
 * column tracking in the scanner and USER_ZZSYN announces that we supply
 * our own zzsyn() syntax-error routine -- confirm against the PCCTS
 * version in use.  The four headers are project files whose contents are
 * not visible from here.
 */
#header
<<
#define ZZCOL
#define USER_ZZSYN

#include "attrib.h"
#include "lex_auxiliary.h"
#include "error.h"
#include "bibtex_ast.h"
>>

/* 
 * lexical classes (ie. lexer modes)
 *
 *    START       - between entries     
 *    LEX_ENTRY   - reading '@', entry type, and '{' or '('
 *    LEX_KEY     - reading entry key (the ',' after it is lexed in LEX_FIELD)
 *    LEX_FIELD   - reading field name '=' field value
 *    LEX_STRING  - reading quoted string (quote characters can be either
 *                  "" or {})
 */


/* In between entries (the START class), we pay attention to '@' and
 * newline only -- anything else is ignored:
 *    newline     - passed to newline() (defined outside this file;
 *                  presumably it keeps the line count up to date)
 *    '@'         - returned as the AT token; switches to LEX_ENTRY
 *    other text  - any run of characters other than newline and '@' is
 *                  skipped
 */

#token        "\n"         << newline (); >>
#token AT     "\@"         << zzmode (LEX_ENTRY); >>
#token        "~[\n\@]+"   << zzskip (); >>


#lexclass LEX_ENTRY

/*
 * In LEX_ENTRY mode, we're looking for the start of the entry (ie. the
 * tokens immediately following an `@').  This is just the entry 
 * type (a BibTeX keyword) followed by the entry "opener" ('{' or '(').
 *
 * A BibTeX keyword is a string of any characters except whitespace and
 * the following:
 *    " # % ' ( ) , = { } 
 * Also, digits are excluded from the first character of a keyword.  Note
 * that this definition is a little stricter than that used by BibTeX
 * itself; however, BibTeX is definitely too loose.  For instance, if
 * you put a '{' in the entry key, BibTeX will happily generate 
 * bogus TeX code.
 *
 * The matched entry type is handed to set_entry_type(), and the opener
 * character to open_entry().  Nothing in this class calls zzmode()
 * directly, so the move to the next lexical class has to happen inside
 * those helpers (presumably open_entry() -- confirm where it is defined).
 */

#token              "\n"         << newline (); >>
#token              "[\ \t]+"    << zzskip (); >>
#token ENTRY_TYPE   "~[0-9\"\#\%\'\(\)\,\=\{\}\ \n\t]~[\"\#\%\'\(\)\,\=\{\}\ \n\t]*"
                                 << set_entry_type (zzlextext); >>
#token ENTRY_OPEN   "[\(\{]"     << open_entry (*zzlextext); >>


#lexclass LEX_KEY

/*
 * In LEX_KEY mode, we read the entry key, which is a BibTeX keyword just
 * like the entry type (same regular expression).  As soon as the key has
 * been matched we switch to LEX_FIELD, so the comma that follows the key
 * is lexed there rather than here.
 */

#token              "\n"         << newline (); >>
#token              "[\ \t]+"    << zzskip (); >>
#token ENTRY_KEY    "~[0-9\"\#\%\'\(\)\,\=\{\}\ \n\t]~[\"\#\%\'\(\)\,\=\{\}\ \n\t]*"
                                 << zzmode (LEX_FIELD); >>


#lexclass LEX_FIELD

/* 
 * This is the lex mode active inside an entry, except within quoted
 * strings.  Thus, we treat newlines and whitespace as usual, and in
 * addition recognise "keywords" (field names or macro names, lexically
 * identical to entry types and keys), numbers, and know enough to farm
 * quoted strings off to the LEX_STRING class.
 *
 * A KEYWORD may not begin with a digit and a NUMBER consists of digits
 * only, so the two patterns never start on the same character.  The
 * character that opens a string ('{' or '"') is handed to start_string(),
 * and the one that closes the entry ('}' or ')') to close_entry(); no
 * zzmode() call is visible here, so any change of lexical class is up to
 * those helpers (defined outside this file).
 */

#token              "\n"         << newline (); >>
#token              "[\ \t]+"    << zzskip (); >>
#token KEYWORD      "~[0-9\"\#\%\'\(\)\,\=\{\}\ \n\t]~[\"\#\%\'\(\)\,\=\{\}\ \n\t]*"
#token EQUALS       "="

#token HASH         "\#"
#token COMMA        ","
#token NUMBER       "[0-9]+"
#token              "[\{\"]"     << start_string (*zzlextext); >>

#token ENTRY_CLOSE  "[\}\)]"     << close_entry (*zzlextext); >>


#lexclass LEX_STRING

/*
 * Here's a reasonably decent attempt at lexing BibTeX strings.  All we
 * really do with whitespace here is turn tabs and newlines into spaces --
 * we don't actually do any whitespace collapsing or deletion, or deletion
 * of quotes.  That comes later, in a post-processing step
 * (postprocess_string(), called from either close_brace() or
 * quote_in_string()).  This has the advantage of making the lexer simpler,
 * and also gives us more flexibility -- we (or the user) can choose
 * whether to collapse whitespace or delete quotes.
 *
 * Also, to be strictly compatible with BibTeX, handling of double-quotes
 * needs to be more general -- BibTeX actually allows them inside
 * double-quoted strings, *as long as they're within a brace-protected
 * substring*.  That is, BibTeX thinks
 *   field = "hello {"there"}"
 * is legitimate.  Ugh.  (BibTool barfs on that, so I can probably get away
 * with doing so as well.  ;-)
 *
 * There's some sleight-of-hand going on here to deal with escaped double
 * quotes.  In short, we have to deal with something like this: 
 *     author = "Sch\"{o}nauer, Willi" 
 * without getting confused about the double quote in a doubly-quoted
 * string.  Hence, we have one regexp to explicitly handle any
 * two-character TeX command sequence, and we explicitly exclude backslash
 * from the main fallthrough regexp (so that we don't inadvertently slurp
 * the backslash part of a command sequence).
 *
 * Token by token: a newline goes to newline_in_string(); a tab is
 * replaced by a space and lexing carries on (zzmore); braces go to
 * open_brace() and close_brace(); a backslash plus whatever character
 * follows it is kept as is; a double quote is the only pattern here with
 * a token name (STRING) and goes to quote_in_string(); any other run of
 * characters is simply accumulated.  The helper functions are defined
 * outside this file.
 */

#token        "\n"               << newline_in_string (); >>
#token        "\t"               << zzreplchar (' '); zzmore (); >>
#token        "\{"               << open_brace (); >>
#token        "\}"               << close_brace (); >>
#token        "\\~[]"            << zzmore (); >>
#token STRING "\""               << quote_in_string (); >>
#token        "~[\n\t\{\}\"\\]+" << zzmore (); >>

/* back to the START class, now that the other lexical classes are done */
#lexclass START


/* At last, the grammar!  After that lexer, this is a snap. */

/*
 * bibfile -- entry point for parsing a whole file: zero or more entries.
 * Automatic tree construction is turned off for this rule (the `!'), and
 * the entries are instead chained by hand through their `right' links, so
 * the result is a forest with one tree per entry (NULL if there are no
 * entries).  Each entry is parsed with cur_line = 0.
 *
 * An entry that produced no tree at all (e.g. because of a syntax error
 * before its type was seen) is left out of the chain.  Without that check
 * `last' would become NULL while the forest is non-empty, and the next
 * good entry would be appended through a NULL pointer.
 */
bibfile!     : << AST *last = NULL; #0 = NULL; >>
               ( entry[0]
                 <<
                    /* a little creative forestry: append to the chain */
                    if (#1 != NULL)
                    {
                       if (#0 == NULL)
                          #0 = #1;
                       else
                          last->right = #1;
                       last = #1;
                    }
                 >>
               )* ;

/*
 * entry -- a single entry: '@', the entry type, the opener, the contents
 * and the closer.  The '@', opener and closer are matched but not put in
 * the tree.
 *
 * cur_line: if positive, the scanner's line counter (zzline) is set to it
 *           before anything is matched; otherwise zzline is left alone.
 */
entry [int cur_line]
             : <<
                  if ($cur_line > 0)
                  {
                     zzline = $cur_line;
                  }
               >>
               AT! ENTRY_TYPE^
               ENTRY_OPEN! contents ENTRY_CLOSE!
               <<
                  /* record the kind of entry on its tree node; any other
                   * metatype leaves the node type untouched */
                  if (EntryMetatype == ETYPE_STRUCTURED)
                     #1->nodetype = AST_ENTRY;
                  else if (EntryMetatype == ETYPE_MACRODEF)
                     #1->nodetype = AST_MACRODEF;
                  else if (EntryMetatype == ETYPE_TEXT)
                     #1->nodetype = AST_TEXT;
               >>
             ;

/*
 * contents -- what sits between the entry opener and closer: either an
 * entry key, a comma and the field list, or -- only when the entry is a
 * macro definition -- one bare field.
 */
contents     : ENTRY_KEY
               <<
                  /* an empty key is merely warned about, not rejected */
                  if ($1.text[0] == '\0')
                  {
                     lexical_warning ("empty key");
                  }
                  #1->nodetype = AST_KEY;
               >>
               COMMA!
               fields
             | << EntryMetatype == ETYPE_MACRODEF >>? field
             ;

/*
 * fields -- comma-separated list of fields.  Note that BibTeX has a little
 * wart in that it allows a single extra comma after the last field only.
 * Easy enough to handle, we just have to do it in the traditional BNFish
 * way (loop by recursion) rather than using EBNF trickery.
 *
 * Because the recursive `fields' may itself be empty, "a = 1, b = 2," is
 * accepted (the list after the last comma is the empty alternative), while
 * "a = 1,," is not: the empty alternative does not consume the second
 * comma.  The list as a whole may be empty too.
 */

fields       : field { COMMA! fields }
             | /* epsilon */
             ;

/*
 * field -- a single `name = value' pair.  The field name becomes the root
 * of the subtree (tagged AST_FIELD), the '=' is matched but not put in the
 * tree, and the value(s) parsed by field_data follow.
 *
 * When compiled with DEBUG > 1, the name and the first value are printed
 * as they are parsed.
 */
field        : KEYWORD^ EQUALS! field_data 
               << #1->nodetype = AST_FIELD; >>
               << 
#if DEBUG > 1
                  /* %p requires a void pointer, hence the casts */
                  printf ("field: fieldname = %p (%s)\n"
                          "       first val = %p (%s)\n",
                          (void *) #1->text, #1->text,
                          (void *) #2->text, #2->text);
#endif
               >>
             ;

/*
 * field_data -- the value of a field: one or more simple values separated
 * by '#' (the HASH tokens are matched but kept out of the tree).
 */
field_data   : field_datum ( HASH! field_datum )* ;

/*
 * field_datum -- one simple value.  Its tree node is tagged according to
 * the token that was matched: a quoted string (AST_STRING), a number
 * (AST_NUMBER), or a bare keyword (AST_MACRO).
 */
field_datum  : STRING      << #1->nodetype = AST_STRING; >>
             | NUMBER      << #1->nodetype = AST_NUMBER; >>
             | KEYWORD     << #1->nodetype = AST_MACRO; >>
             ;
