/*  $Header: lexical.c,v 3.1 89/05/25 11:06:58 jos Exp $ */
/*
 *  This file is part of the Amsterdam SGML Parser.
 *
 *  Copyright: Faculteit Wiskunde en Informatica
 *             Department of Mathematics and Computer Science
 *             Vrije Universiteit Amsterdam
 *             The Netherlands
 *
 *  Authors:   Sylvia van Egmond
 *             Jos Warmer
 */
#include "types.h"
#include "Lpars.h"
#include "charclas.h"
#include "conc_syn.h"
#include "in.h"
#include "keywords.h"
#include "lexical.h"
#include "mode_stk.h"
#include "modes.h"
#include "shortref.h"
#include "symtable.h"

#ifdef DOC_PARSER
#include "startend.h"
#include "tags.h"
#endif

/* Defined elsewhere; LLmessage() below reports on this symbol. */
extern int LLsymb;

/* Maximum number of characters get_keyword() stores; buffers are KEY_LENGTH+1. */
#define KEY_LENGTH   30

#ifdef DOC_PARSER
/* Stack created in init_lexical(); handle_net_start() pushes end tags on it,
 * handle_net_end() pops it and match_constraint() reads its top (C_ELEM). */
static STACK  net_stack;
/* Start tag recorded by match_constraint() when a NET was seen in that tag;
 * consumed and reset by handle_net_start(). */
static Token  last_net_token = TOK_NOD;
/* Set by scan() when TOK_NET is found while in_starttag is TRUE. */
static Bool   net_used       = FALSE;
/* TRUE while start_tag_attributes() is running. */
Bool          in_starttag    = FALSE;
/* TRUE while match_constraint() looks for the delimiter closing an end tag. */
Bool          in_endtag      = FALSE;
#endif
/* Assigned by start_tag_attributes() (DOC_PARSER builds only). */
String        current_element_name;

/* Pointer to a character predicate, as handed to tail(). */
typedef  int (*IntFunctionChar)(PAR  char  RAP);

#ifdef DEBUG
/* Switched by debug_lexical(); enables the extra output in next_token()
 * and LLmessage(). */
static Bool   debug = FALSE;
static void   print_token(PAR  int token, int value  RAP);
#endif

/* Zero-terminated token/value pair that char_ref() hands to token_input(). */
static int               token_string[2];
static int               char_ref_str[2];
/* One-character string that char_ref() hands to string_input(). */
static char              input_values[]        = {0, 0};
/* Set through pero(); consulted by match_delim() for TOK_PERO. */
static Bool              in_entity_declaration = FALSE;
/* Token produced by the most recent scan(); see current_token(). */
static Token             token;
/* Counter decremented by scan(); keywords are only tried in MODE_SGML
 * while it is <= 0. */
static int               dont_try_sgmlkey;

/* Value belonging to 'token'; see current_value(). */
static int               value;

/* Delimiter table of the current mode, terminated by TOK_NOD. */
static Delimiter       *delimiters;
/* Current recognition mode and the stack of modes entered before it. */
static RecognitionMode  recog_mode;
static ModeStack        mode_stack;

/* Forward declarations of the file-local helpers. */
static void  scan              ();
static void  get_keyword       (PAR  String  RAP);
static Token charclass         (PAR  int    RAP);
static int   match_constraint  (PAR  int tok  RAP);
/* NOTE(review): declared variadic here, but defined below with five fixed
 * parameters - confirm what PAR/RAP expand to. */
static int   match_more        (PAR  int, int, ...  RAP);
static int   match             (PAR  int try_token  RAP);
static int   match_delim       (PAR  int try_token  RAP);
static int   match_shortref    (PAR  int shref  RAP);
#ifdef DOC_PARSER
static int   skip_s            ();
#endif
 

/* Global buffers with external linkage; nothing in this file reads or
 * writes them.  NOTE(review): presumably an attribute name/value pair used
 * by another module - confirm. */
char Gatt_name[80];
char Gatt_value[80];

/* This should go together with the part in 'lexical.c'
 */

#ifdef DOC_PARSER
/* Copies of 'token' and 'value' made by save_environ() and put back by
 * reset_environ(); only one level can be saved. */
int save_token;
int save_value;

/*
 *  Remembers the current token/value pair in save_token/save_value so that
 *  reset_environ() can restore it later.
 */
static void save_environ()
{
    save_value = value;
    save_token = token;
}
/*
 *  Restores the token/value pair previously stored by save_environ().
 */
static void reset_environ()
{
    value = save_value;
    token = save_token;
}

/*
 *  Parses the attribute specification list of the start tag whose generic
 *  identifier is 'name'.
 *
 *  The current token/value pair is saved before and restored after the
 *  call to att_spec_list(), so the caller's scanner state is unchanged.
 *  While the list is parsed, in_starttag is TRUE and the recognition mode
 *  is MODE_TAG; scan() sets net_used if it meets TOK_NET in that time.
 */
static void start_tag_attributes(name)
String name;
{
    /* make next_token return -1 when anything ending the tag is found.
     */
    save_environ();
    in_starttag = TRUE;
    net_used    = FALSE;		/* scan() sets it when TOK_NET is seen */
    enter_mode(MODE_TAG);
    current_element_name = name;
    pushback_ch(1);			/* step back one item before parsing */
    att_spec_list();
    next_ch();				/* step forward again afterwards */
    in_starttag = FALSE;
    leave_mode();
    reset_environ();
}
#endif

/*
 *  Returns non-zero when 'ch' is a SPACE or a SEPCHAR character, zero
 *  otherwise.  match_shortref() uses it to recognise blank sequences.
 *
 *  The return type is spelled out: implicit 'int' is no longer valid C
 *  since C99.
 */
int is_blank(ch)
int ch;
{
    return (is_SPACE(ch) or is_SEPCHAR(ch));
}

/*
 *  Reads the next input item with next_ch() and classifies it:
 *    - a negative item is returned unchanged;
 *    - an item >= 256 is a token that was placed on the input; its value
 *      is fetched with current_ch_value() and it is passed through
 *      match_constraint();
 *    - any other item is a character: if it can start a delimiter, the
 *      delimiters of the current mode are tried in table order, otherwise
 *      (or when none matches) the character is classified by charclass()
 *      and stored in 'value'.
 *  Returns the recognised token or character class.
 */
static Delimiter scan_delimiter()
/*
 *  The correctness of this procedure depends on the fact that the delimiters
 *  are stored in order of decreasing length of the string assigned to them
 *  by the concrete syntax.
 */
{
    int           i;
    Delimiter     delim;
    Token         answer = TOK_NOD;
    int           ch;
    Bool          try_shortref;

    ch = next_ch();
    if( ch <  0   ){
	answer = ch;
    } else if( ch >= 256 ){     /* a token! */
	answer = ch;
	value  = current_ch_value();
	/*
	 *  A token that is put on the input queue, should not be tested
	 *  for matching constraints. It must be Ok.
         *
	 *  NOTE(review): the code below nevertheless runs
	 *  match_constraint() on the token - confirm which is intended.
	 */
	next_ch();			/* match_constraint looks at current_ch() */
	answer = match_constraint(answer);
	if( answer == TOK_NOD ){
	    /* constraint not met: step back and return the item itself */
	    pushback_ch(1);
	    answer = current_ch();
	    /*report(SYSTEM_CONSTR, FATAL, 0, 0, current_ch()); */
	} else {
	    pushback_ch(1);
	}
	/*
         */
    } else {
	if( is_delim_char(ch) ){
	    try_shortref = is_current_shortref_char(ch);
	    /* the delimiter table is terminated by TOK_NOD */
	    for(i=0; (delim = delimiters[i]) != TOK_NOD ; i++){
		if( is_shortref(delim) ){
		    if( try_shortref ){
			if( (answer = match_shortref(delim)) != TOK_NOD ){
			    break;
			}
		    } else {
			/* first short reference entry ends the search */
			break;
		    }
		} else {
		    if( (answer = match_delim(delim)) != TOK_NOD ){
			break;
		    }
		}
	    }
	}
	if( answer == TOK_NOD ){
	    /* no delimiter recognised: plain character */
	    answer = charclass( current_ch() );
	    value  = current_ch();
	} else {
	    /* the match routines read one item beyond the delimiter */
	    pushback_ch(1);
	}
    }

    return answer;
}

/*
 *  Scans one token into 'token' (and, where applicable, 'value').
 *
 *  DOC_PARSER builds: while in_starttag is TRUE, any of STAGO, ETAGO, NET
 *  or TAGC ends the tag: 'token' becomes -1, net_used is set for NET, and
 *  STAGO/ETAGO are pushed back onto the input.
 *
 *  Other builds: in MODE_SGML, when the current character starts a name
 *  and dont_try_sgmlkey has run down to zero or below, an SGML-declaration
 *  keyword is tried.  A word that is no keyword is pushed back and the
 *  counter is set to its length, so the keyword is not retried on each of
 *  its characters.
 *
 *  Both variants finish with done_ch().
 */
static void scan()
{
#ifndef DOC_PARSER
    char      key[KEY_LENGTH+1];
#endif

    token = scan_delimiter();

#ifdef DOC_PARSER
    if( in_starttag and
	( (token==TOK_STAGO) or (token==TOK_ETAGO) or
	  (token==TOK_NET)   or (token==TOK_TAGC) ) )
    {
	DEB("scan: in_starttag and S/ETAGO\n");

	if( token==TOK_NET ){
	    net_used = TRUE;
	}
	if( (token==TOK_STAGO) or (token==TOK_ETAGO) ){
	    /* a new tag starts: leave its opening delimiter on the input */
	    pushback_ch( token_length(token, 0) );
	}
	token = -1; 	/* EOFILE */
	done_ch();
	return;
    }
#else
    dont_try_sgmlkey--;
    if( (current_mode()==MODE_SGML) and
	is_name_start_character(current_ch()) and
	(dont_try_sgmlkey <= 0) )
    {
	DEB("trying keyword in SGML-decl\n");

	get_keyword(key);
	token = sgml_string_to_key(key);
	if( token == TOK_NOD ){
	    DEB1("wrong keyword %s in SGML-decl\n", key);
	    pushback_ch( strlen(key) );
	    dont_try_sgmlkey = strlen(key);
	    token = charclass(current_ch());
	    value = current_ch();
	} else {
	    DEB1("found keyword %s in SGML-decl\n", key);
	    pushback_ch(1);
	    value = 0;
	}
    }
#endif
    done_ch();
}

/*
 *  Maps the character 'ch' to its character-class token.  The tests are
 *  made in a fixed order and the first class that applies wins; EOF is
 *  passed through unchanged and anything unclassified is TOK_DATACHAR.
 *  TOK_SPECIAL is only produced in the four literal modes listed below.
 */
static Token charclass(ch)
int ch;
{
    Token result;

    if( ch == EOF ){
	result = EOF;
    } else if( is_NONSGML(ch) ){
	result = TOK_NONSGML;
    } else if( is_UC_LETTER(ch) or is_LC_LETTER(ch) ){
	result = TOK_LETTER;
    } else if( is_DIGIT(ch) ){
	result = TOK_DIGIT;
    } else if( is_SPACE(ch) ){
	result = TOK_SPACE;
    } else if( is_SEPCHAR(ch) ){
	result = TOK_SEPCHAR;
    } else if( is_RS(ch) ){
	result = TOK_RS;
    } else if( is_RE(ch) ){
	result = TOK_RE;
    } else if( check_int(current_mode(), MODE_LIT_ERO, MODE_LITA_ERO,
			 MODE_LIT_PERO, MODE_LITA_PERO, 0) and
	       is_SPECIAL(ch) ){
	result = TOK_SPECIAL;
    } else if( is_name_start_character(ch) ){
	result = TOK_NMSTRT;
    } else if( is_name_character(ch) ){
	result = TOK_NMCHAR;
    } else {
	result = TOK_DATACHAR;
    }
    return result;
}

/*
 *  Scans the next token and returns it; the accompanying value can be
 *  fetched with current_value().  With DEBUG compiled in and debugging
 *  switched on, the current mode and the token are written to fpdg.
 *
 *  TOK_NONSGML tokens are handed to the caller like any other token; an
 *  earlier loop that reported and skipped them is no longer active.
 */
Token    next_token()
{
    scan();

#ifdef DEBUG
    if( debug ){
	fprintf(fpdg,"(%s): ",recognition_to_string(recog_mode));
	print_token(token, value);
    }
#endif

    return token;
}

/*
 *  Returns the token produced by the most recent scan.
 */
Token    current_token()
{
    return( token );
}

/*
 *  Returns the value stored alongside the current token.
 */
int     current_value()
{
    return( value );
}

/*
 *  Makes 'm' the current recognition mode.  The mode in force until now is
 *  pushed on mode_stack so that leave_mode() can return to it, and the
 *  delimiter table is switched to that of 'm'.  Entering MODE_SGML also
 *  resets the keyword hold-off counter.
 */
void     enter_mode( m )
RecognitionMode  m;
{
    DEB1("enter_mode: %s\n", recognition_to_string(m) );

    if( m == MODE_SGML ){
	dont_try_sgmlkey = 0;
    }

    PushMode(mode_stack, recog_mode);
    recog_mode = m;
    delimiters = delimiter_mode(m);
}

void     leave_mode( )
{
    DEB1("leave_mode: %s\n", recognition_to_string(recog_mode));

    recog_mode = PopMode(mode_stack );
    delimiters = delimiter_mode(recog_mode);
}

/*
 *  Returns the recognition mode currently in force.
 */
RecognitionMode current_mode()
{
    return recog_mode;
}

void push_back_token()
{
    Token  tokens[2];
    int    values[2];
    DEB2("push_back_token: token %d, value %d\n", current_token(), current_value());

    tokens[0] = current_token();
    values[0] = current_value();
    tokens[1] = 0;
    values[1] = 0;
    if( current_token() == TOK_RS ){
	token_input("TOK_RS", tokens, values);
    } else {
	token_input("pushback_token", tokens, values);
    }
}

/*
 *  Collects the run of alphabetic characters starting at the current
 *  character into 'key' and terminates it with '\0'.  At most KEY_LENGTH
 *  characters are stored; further characters of the run are read and
 *  dropped.  On return the current character is the first one that is not
 *  alphabetic.  'key' must have room for KEY_LENGTH+1 characters.
 */
static void  get_keyword(key)
String key;
{
    int  len = 0;
    int  c;

    for( c = current_ch(); is_alpha(c); c = next_ch() ){
	if( len < KEY_LENGTH ){
	    key[len++] = c;
	}
    }
    key[len] = '\0';
}

#ifdef DOC_PARSER
/*
 *  Reads a name starting at the current character into 'key'.
 *
 *  If the current character cannot start a name, 'key' becomes the empty
 *  string and one item is pushed back.  Otherwise name characters are
 *  collected; once NAMELEN characters are stored, WAR_NAMELEN is reported
 *  once and the rest of the name is read but discarded.  The result is
 *  upper-cased when NAMECASE_GENERAL holds.
 */
static void  get_name(key)
String key;
{
    int  c;
    int  len = 0;
    Bool truncated = FALSE;

    c = current_ch();
    if( not is_name_start_character(c) ){
	key[0] = '\0';
	pushback_ch(1);
	return;
    }

    for( ; is_name_character(c); c = next_ch() ){
	if( truncated ){
	    continue;			/* skip the surplus characters */
	}
	if( len < NAMELEN ){
	    key[len++] = c;
	} else {
	    key[len] = '\0';
	    report(WAR_NAMELEN, NOTFATAL, 0, 0, key, NAMELEN);
	    truncated = TRUE;
	}
    }
    key[len] = '\0';

    if( NAMECASE_GENERAL ){
	to_upper(key);
    }
}
#endif

/*
 *  Returns 1 when 'ch' closes a reference, i.e. when it is a carriage
 *  return or a semicolon, and 0 for every other value.
 */
static int refc(ch)
int ch;
{
    switch( ch ){
	case '\r':
	case ';' :
	    return 1;
	default  :
	    return 0;
    }
}

void tail(name, is_ok)
String           name;
IntFunctionChar  is_ok;
{
    int  ch;
    int  i = 1;
    Bool error = FALSE;

    ch = next_ch();
    while( (*is_ok)(ch) ){
	if (!error and (i >= NAMELEN)) {
	    name[i] = '\0';
	    report(WAR_NAMELEN, NOTFATAL, 0, 0, name, NAMELEN);
	    error = TRUE;
	} else if (!error) {
	    name[i++] = ch;
	}
	ch = next_ch();
	DEB1("name_tail: next_ch = %c\n", ch);
    }
    name[i] = '\0';
    pushback_ch(1);
}

/*
 *  Completes the name whose first character is already in s[0] by
 *  appending the name characters that follow on the input; see tail().
 */
void name_tail(s)
String s;
{
    tail(s, is_name_character);
}

/*
 *  Completes the number whose first digit is already in s[0] by appending
 *  the digits that follow on the input; see tail().
 */
void number_tail(s)
String s;
{
    tail(s, is_DIGIT);
}

void  param_name(name)
String name;
{
    int i=0;
    int ch;

    ch = next_ch() ;
    DEB1("param_name: next_ch = %c\n", ch);

    if( not is_name_start_character(ch) ){
	strcpy(name, "DEFAULT");
	return;
    } else {
	name[0] = ch;
	DEB1("param_name: first_ch = %c\n", ch);
    }
    name_tail(name);

    ch = next_ch();
    if( !refc(ch) ){
	pushback_ch(1);
	DEB("param_name: push back one charatcer( no refc)\n");
    }
    dont_try_sgmlkey -= (strlen(name) -1);
    done_ch();
    if( NAMECASE_ENTITY ){
	to_upper(name);
    }
}

/*
 *  Handles a character reference; it reads what follows on the input.
 *
 *  Named reference:   the name is upper-cased and looked up with
 *                     function_char(); a known name is replaced by that
 *                     character through string_input(), an unknown name
 *                     is reported (CHREF_UNKNOWN).
 *  Numeric reference: the number is converted with atoi() and put on the
 *                     input as a TOK_DATACHAR token through token_input().
 *  Anything else:     CHREF_ERROR is reported.
 *
 *  One following reference-close character (see refc()) is consumed if
 *  present.
 *
 *  'str' starts out empty so that the length computation below is defined
 *  on the error path too, where nothing is collected.  In that case the
 *  keyword hold-off counter is left alone.
 */
void char_ref()
{
    Bool digit = FALSE;
    int  ch;
    int  replace_val = '\0';
    int  ref_len;
    char str[80];

    str[0] = '\0';

    ch = next_ch() ;
    DEB1("char_ref: next_ch = %c\n", ch);

    if( is_name_start_character(ch) ){
	str[0] = ch;
	name_tail(str);
	to_upper(str);
	if( (replace_val = function_char(str)) == '\0' ){
	    report(CHREF_UNKNOWN, FATAL, 0, 0, str);
#ifdef DEBUG
	} else {
	    DEB2("char_ref: function name `%s' value %d\n", str, replace_val);
#endif
	}
    } else if( is_DIGIT(ch) ){
	digit = TRUE;
	str[0] = ch;
	number_tail(str);
	DEB1("char_ref: number `%s' \n", str);
    } else {
	report(CHREF_ERROR, FATAL, 0, 0);
    }

    /* swallow the reference close when there is one */
    ch = next_ch();
    if( !refc(ch) ){
	pushback_ch(1);
	DEB("char_ref: push back one character( no refc)\n");
    }

    /* account for the characters of the name/number read here */
    ref_len = (int) strlen(str);
    if( ref_len > 0 ){
	dont_try_sgmlkey -= (ref_len - 1);
    }
    done_ch();

    if( replace_val != '\0' ){
        input_values[0] = replace_val;
        string_input(input_values);
    } else if( digit ) {
	char_ref_str[0] = atoi(str);
	token_string[0] = TOK_DATACHAR;
	token_string[1] = 0;
	char_ref_str[1] = 0;
	token_input("char_ref", token_string, char_ref_str);
    }
}

/*
 *  Initialises the scanner: creates the mode stack (and, in DOC_PARSER
 *  builds, the NET stack), enters MODE_CON and initialises the character
 *  class tables.
 */
void init_lexical()
{
    /* recog_mode must hold a value before enter_mode() pushes it */
    recog_mode = MODE_CON;
    mode_stack = CreateModeStack(-1);
#ifdef DOC_PARSER
    net_stack  = (STACK)CreateElemStack(-1);
#endif
    /* also installs the delimiter table for MODE_CON */
    enter_mode( MODE_CON);
    init_charclas();
}

#ifdef DEBUG
/*
 *  Switches the scanner's debug output (see next_token() and LLmessage())
 *  on or off.
 *
 *  The parameter is not called 'bool': that identifier is a macro once
 *  <stdbool.h> is included and a keyword in C23.
 */
void debug_lexical(enable)
Bool enable;
{
    debug = enable;
}
#endif

/*
 *  The following procedure is called from an entity declaration to
 *  work around a contradiction in the standard:
 *  According to the rules in section 9.6 of the standard, concerning
 *  delimiter recognition, a PERO delimiter can only be recognized if
 *  it is directly followed by a 'name start character'.
 *  In the declaration of a parameter entity, however, the PERO must be
 *  followed by one or more 'parameter separators' (see standard section
 *  10.5.1, page 38). These two requirements contradict each other.
 *  The fix is that inside an entity declaration the function 'pero' is
 *  called to tell the constraint resolver that the constraint does not
 *  hold.
 */
/*
 *  Records whether the parser is inside an entity declaration.  While the
 *  flag is TRUE, match_delim() answers TOK_PERODEF for a PERO delimiter
 *  whose contextual constraint is not met.
 */
void pero(b)
Bool b;
{ in_entity_declaration = b; }

/*
 *  Checks the contextual constraint attached to delimiter/token 'tok'
 *  against the input that follows it (current_ch() is the first item after
 *  the delimiter).
 *
 *  Returns the token to deliver to the parser - 'tok' itself or a more
 *  specific token (declaration keyword, start/end tag token, ...) - or
 *  TOK_NOD when the constraint is not satisfied.
 *
 *  'answer' starts out as TOK_NOD and the default branch sets it
 *  explicitly, so that an unknown constraint kind no longer returns an
 *  uninitialised value after the error has been reported.
 */
static int match_constraint(tok)
Token tok;
{
    int          ch;
    int          i = 0;
#ifdef DOC_PARSER
    Parserinfo   info;
#endif
    char         key[KEY_LENGTH+1];
    char         gi_name[80];
    Token        answer = TOK_NOD;

    ch = current_ch();
    DEB2("match_constraint: tok %d ch %d\n", tok, ch);

    switch(constraint(tok)){
	case C_NO :				/* no constraint */
	    answer = tok;
	    break;
	case C_CREF :				/* character reference */
	    if( is_name_start_character(ch) ){
		answer = tok;  /*get_name(value.string);*/
	    } else if( is_DIGIT(ch) ){
		answer = tok;  /*get_number(value.string);*/
	    } else {
		return TOK_NOD;
	    }
	    break;
	case C_DCL  :				/* markup declaration open */
	    if( is_name_start_character(ch) ){
                get_keyword(key);
	        answer = mdo_keyword(key);
	        if( answer == TOK_NOD ){    	/*  illegal keyword  */
		    report(KEY_WRONG, FATAL, 0, 0, key, "declaration",
			   "ELEMENT, ATTLIST, etc...", "nothing");
		    /* pushback_ch( strlen(key) ); */
		    answer = TOK_MDO;
	        }
	    } else {
		/* no keyword: a comment, a marked section or an empty
		 * declaration may follow */
		switch( match_more(TOK_COM, TOK_DSO, TOK_MDC, TOK_NOD) ){
		    case TOK_COM : answer = MDO_COM;
				   break;
		    case TOK_DSO : answer = TOK_MDO_DSO;
				   break;
		    case TOK_MDC : answer = MDO_MDC;
				   break;
		    default      : answer = TOK_NOD;
				   break;
		}
	    }
	    break;
	case C_GI   :				/* generic identifier */
#ifdef GENERATOR
            answer = TOK_NOD;
#else
	    if( in_starttag or in_endtag ){
		/* already inside a tag: scan() / the ETAGO code below
		 * deal with the delimiter themselves */
		answer = tok;
		break;
	    }
	    if( is_name_start_character(ch) ){
		get_name(gi_name);
		if( tok == TOK_STAGO ){
		    answer = starttag_keyword(gi_name);
		    start_tag_attributes(gi_name);
		    if( answer == TOK_NOD ){
			report(TAG_UNKNOWN, FATAL, 0, 0, "start", gi_name);
			done_ch();
			for(i=0; i<token_length(tok, 0); i++){
			    next_ch();
			}
		    } else {
			if( net_used ){
			    DEB1("last_net = %s\n", starttag_to_string(answer));
			    last_net_token = answer;
			}
		    }
		} else {		/* ETAGO */
		    Token tmp;

		    answer = endtag_keyword(gi_name);
                    i = skip_s();
		    in_endtag = TRUE;
		    tmp=match_more(TOK_TAGC,TOK_STAGO,TOK_ETAGO,TOK_NOD);
		    in_endtag = FALSE;
		    switch(tmp){
		    case TOK_TAGC : break;
		    case TOK_STAGO:
		    case TOK_ETAGO: pushback_ch( token_length(tmp, 0) );
				    break;
		    case TOK_NOD  : report(TAG_NO_CLOSE, FATAL, 0, 0);
				    pushback_ch(i);
				    break;
		    default       : report(NO_LABEL, FATAL, 0, 0, 0, "scan");
				    break;
		    }
		    if( answer == TOK_NOD ){
			report(TAG_UNKNOWN, FATAL, 0, 0, "end", gi_name);
			done_ch();
			for(i=0; i<token_length(tok, 0); i++){
			    next_ch();
			}
		    }
		}
	    } else {
		/* no name: only an empty tag is acceptable here */
	        i = skip_s();
	        if( match(TOK_TAGC) == TOK_TAGC ){
		    info = last_opened_info();
		    answer=((tok==TOK_STAGO)?info_starttag(info)
					    :info_endtag(info)  );
		    if( answer == 0 ){
			if( tok == TOK_STAGO ){
			    answer = document_start(); /* DOCUMENT */
			} else {			/* ETAGO */
			    report(TAG_EMPTY, FATAL, 0, 0, "end");
			    answer = TOK_NOD;
			    done_ch();
			    for(i=0; i<token_length(tok, 0); i++){
				next_ch();
			    }
			}
		    }
	        } else {
		    pushback_ch(i);
		    answer = TOK_NOD;
	        }
	    }
#endif
	    break;
	case C_ELEM :				/* null end tag */
#ifdef GENERATOR
	    answer = TOK_NOD;
#else
	    if( in_starttag ){
		answer = tok;
	    } else if( Size_elemstack(net_stack) == 0 ){
		answer = TOK_NOD;
	    } else {
		DEB1("pop net_stack element %s\n",
					endtag_to_string(TopElem(net_stack)));
		answer = TopElem(net_stack);
	    }
#endif
	    break;
	case C_MSE  :				/* marked section end */
	    if( match_delim(TOK_MDC) != TOK_NOD ){
		answer = TOK_MSC_MDC;
	    } else {
		answer = TOK_NOD;
	    }
	    break;
	case C_NMS  :				/* name start must follow */
	    if( is_name_start_character(ch) ){
		answer = tok;
	    } else {
		answer = TOK_NOD;
	    }
	    break;
	default:
	    report(NO_LABEL, FATAL, 0, 0, constraint(tok), "match_constraint");
	    answer = TOK_NOD;		/* unknown constraint: no match */
	    break;
    }
    return answer;
}

#ifdef DOC_PARSER
/*
 *  Skips separator characters (as judged by is_s()) starting at the
 *  current character and returns how many were skipped, so that the
 *  caller can push them back again.
 */
static int skip_s()
{
    int skipped = 0;
    int ch;

    for( ch = current_ch(); is_s(ch); ch = next_ch() ){
	skipped++;
    }
    DEB1("skip_s: %d\n", skipped);
    return skipped;
}
#endif

/*
 *  Tries to match the delimiter 'try_token' in the input stream.
 *  PRE:  current_ch() is the first character to be matched.
 *  POST: on failure (TOK_NOD) the input is back where it was, except that
 *        TOK_PERODEF is answered without pushing back for a PERO inside an
 *        entity declaration (see pero());
 *        on success the delimiter has been consumed and the token chosen
 *        by match_constraint() is returned.
 *
 *  The comparison loop stops at the end of the delimiter string.  Without
 *  that test an input item equal to 0 would compare equal to the string
 *  terminator and the loop would run past the end of the string.
 */
static int match_delim(try_token)
Token      try_token;
{
    String str;
    int    i = 0;		/* number of characters consumed so far */
    int    ch;
    Token  answer;

    if( try_token == TOK_NOD ){ return TOK_NOD; }

    str = delimiter_to_string(try_token);
    ch  = current_ch();
    while( ((*str) != '\0') and (ch == (*str)) ){ /* try to match try_token */
      str++;
      ch = next_ch();
      i++;
    }
    if( (*str)=='\0' ){  		/* try_token recognized */
	answer = match_constraint(try_token);
	if( answer == TOK_NOD ){
	    if( (try_token == TOK_PERO) and in_entity_declaration ){
		answer = TOK_PERODEF;
	    } else {
		pushback_ch(i);
	    }
	}
	return answer;
    } else {
	pushback_ch(i);
	return TOK_NOD;
    }
}

static int match_shortref(shref)
Token shref;
{
    int      i = 0, ch, nr_blanks = 0;
    String   str;
    Token    answer;
    Bool     found = FALSE;

    if( in_current_map(shref) ){
	str = delimiter_to_string(shref);
	ch = current_ch();

	while( (*str == ch) or ((*str=='B') and is_blank(ch) ) ) {
	    if( *str == 'B' ){
		nr_blanks++;
		if( nr_blanks >= BSEQLEN ){
		    str++;
		} else if( *(str+1) == 'B' ){
		    str++;
		} else if( not is_blank(next_ch()) ) {
		    pushback_ch(1);
		    str++;
		} else {
		    pushback_ch(1);
		}
	    } else {
		str++;
	    }
	    ch = next_ch();
	    i++;
	}

	if( (*str == '\0') or found ){		/* matched */
	    answer = TOK_SHORTREF;
	    value  = shref;
	} else {
	    pushback_ch(i);
	    answer = TOK_NOD;
	}
    } else {
	answer = TOK_NOD;
    }
    return answer;
}

/*
 *  Tries to match 'try_token' at the current input position, using the
 *  short reference matcher for short reference delimiters and the plain
 *  delimiter matcher for everything else.  Returns the matcher's answer.
 */
static int match(try_token)
Token try_token;
{
    if( is_shortref(try_token) ){
	return match_shortref(try_token);
    }
    return match_delim(try_token);
}

/*
 *  Tries the given delimiters in order and returns the first one that
 *  matches, or TOK_NOD when none does.  The list is terminated by TOK_NOD;
 *  the callers in this file pass fewer than five arguments and rely on
 *  that terminator.
 *
 *  The terminator is tested explicitly before each attempt, so a parameter
 *  beyond it - which the caller may not have passed at all - is never
 *  read.  Previously this depended on match(TOK_NOD) returning TOK_NOD.
 */
static int match_more(t1, t2, t3, t4, t5)
Token t1, t2, t3, t4, t5;
{
    if( t1 == TOK_NOD ){ return TOK_NOD; }
    if( match(t1)==t1 ){ return t1; }

    if( t2 == TOK_NOD ){ return TOK_NOD; }
    if( match(t2)==t2 ){ return t2; }

    if( t3 == TOK_NOD ){ return TOK_NOD; }
    if( match(t3)==t3 ){ return t3; }

    if( t4 == TOK_NOD ){ return TOK_NOD; }
    if( match(t4)==t4 ){ return t4; }

    if( t5 == TOK_NOD ){ return TOK_NOD; }
    if( match(t5)==t5 ){ return t5; }

    return TOK_NOD;
}

#ifdef DEBUG
/*
 *  Debug helper: writes the textual form of 'token' (with 'value') to the
 *  debug stream fpdg and flushes it.
 *  The parameters shadow the file-level 'token' and 'value'.
 */
static void print_token(token, value)
Token token;
int   value;
{

    fprintf(fpdg,"next token: %s\n",token_to_string(token, value) );
    fflush(fpdg);
}
#endif

#define STRING_LENGTH   40

/*
 *  Reports a syntax error found by the parser.
 *
 *    tok == -1 : LLGEN_EOF is reported for the current symbol LLsymb.
 *    tok ==  0 : the current symbol is deleted.  For character-like
 *                tokens, the whole run of tokens equal to LLsymb is
 *                collected and reported in chunks of at most
 *                STRING_LENGTH characters (LLGEN_DEL_CHARS); the first
 *                differing token is pushed back.  Other tokens are
 *                reported one at a time (LLGEN_DEL).
 *    otherwise : token 'tok' is inserted.  The current token is pushed
 *                back, 'value' gets a placeholder that fits 'tok', and
 *                LLGEN_INSERT is reported.
 *
 *  The return type is spelled out (implicit 'int' is invalid since C99)
 *  and the function returns 0 rather than falling off its end.
 */
int LLmessage(tok)
int   tok;
{
    char  delete_string[STRING_LENGTH+1];
    char  message[80];
    int   i = 0;

    switch(tok){
	case  -1 :
	    sprintf(message, "%s", token_to_string(LLsymb, current_value()));
	    report(LLGEN_EOF, FATAL, 0, 0, message);
	    break;
	case   0 :
	    if( check_int(LLsymb, TOK_LETTER, TOK_DIGIT, TOK_NMCHAR,
				  TOK_NMSTRT, TOK_SPECIAL, TOK_DATACHAR, 0))
	    {
		delete_string[i++] = current_value();
		while( next_token() == LLsymb ){
		    delete_string[i++] = current_value();
		    if( i == STRING_LENGTH ){
			/* buffer full: report this chunk and start over */
		        delete_string[i] = '\0';
		        report(LLGEN_DEL_CHARS, FATAL, 0, 0, delete_string);
			i = 0;
		    }
	        }
		/* the token that ended the run was not deleted */
	        push_back_token();
		if( i != 0 ){
		    delete_string[i] = '\0';
		    report(LLGEN_DEL_CHARS, FATAL, 0, 0, delete_string);
		}
	    } else {
	        sprintf(message, "%s",token_to_string(LLsymb, current_value()));
#ifdef DEBUG
	        if( debug ){
		    sprintf(message, "%s (%d)",
			    token_to_string(LLsymb, current_value()),LLsymb);
	        }
#endif
	        report(LLGEN_DEL, FATAL, 0, 0, message);
	    }
	    break;
	default  :
	    push_back_token();
	    /* placeholder value for the inserted token */
	    switch(tok){
	        case TOK_LETTER   : value = 'A'; break;
	        case TOK_DIGIT    : value = '1'; break;
	        case TOK_NMCHAR   : value = 'C'; break;
	        case TOK_NMSTRT   : value = 'C'; break;
	        case TOK_SPECIAL  : value = '='; break;
	        case TOK_DATACHAR : value = 'C'; break;
	        default           : value = 0;   break;
	    }
	    sprintf(message, "%s", token_to_string(tok, value));
#ifdef DEBUG
	    if( debug ){
	        sprintf(message, "%s (%d)", token_to_string(tok, value),tok);
	    }
#endif
	    report(LLGEN_INSERT, FATAL, 0, 0, message);
	    break;
    }
    return 0;
}

#ifdef DOC_PARSER
/*
 *  Called with a start tag token and the matching end tag token.
 *
 *  If 'starttag' is the tag that was last recorded as having used a NET,
 *  that record is cleared, 'endtag' is pushed on net_stack and the negated
 *  end tag is returned as a marker for handle_net_end().  Otherwise
 *  'endtag' is returned unchanged.
 */
int handle_net_start(starttag, endtag)
Token starttag;
Token endtag;
{
    if( last_net_token != starttag ){
#ifdef DEBUG
	DEB1("handle_net: nothing done for %s\n", endtag_to_string(endtag));
#endif
	return endtag;
    }

    last_net_token = TOK_NOD;
    DEB1("push net_stack element %s\n",
				endtag_to_string(endtag));
    PushElem(net_stack, endtag);
    return -endtag;
}

/*
 *  Counterpart of handle_net_start().  A negative 'endtag' is the marker
 *  for an element that was pushed on net_stack: the stack is popped and
 *  the positive end tag token is returned.  A non-negative 'endtag' is
 *  returned unchanged.
 */
int handle_net_end(endtag)
Token endtag;
{
    if( endtag >= 0 ){
	return endtag;
    }
    PopElem(net_stack);
    return -endtag;
}
#endif
