/* User:Antigng/AF/AFTokenizer */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "AFTokenizer.h"
#include "AFParser.h"
#include "mem.h"
#include "struct.h"

/*
 * Hexadecimal digit lookup table, indexed by byte value (256 entries, laid
 * out 16 per row so row N covers bytes 0xN0..0xNF; positions assume ASCII).
 * An entry is 0 for bytes that are not hex digits; otherwise it holds the
 * digit's value plus one: '0'..'9' -> 1..10, 'A'..'F' and 'a'..'f' -> 11..16.
 * The +1 bias lets 0 double as "not a hex digit"; users subtract 1 to
 * recover the numeric value.
 */
const unsigned char hex_decoding[]=
{
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 0, 0, 0, 0, 0,
	0,11,12,13,14,15,16, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0,11,12,13,14,15,16, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Evaluates to 0 if ch is not a hex digit, otherwise to (digit value + 1).
 * The cast to unsigned char keeps the index in 0..255 even where plain char
 * is signed.
 */
#define isValidForHex(ch) (hex_decoding[(unsigned char)(ch)])
/*
 * Identifier-character lookup table, indexed by byte value (256 entries,
 * 16 per row; positions assume ASCII). An entry is 1 for '0'..'9',
 * 'A'..'Z', 'a'..'z' and '_', and 0 for every other byte.
 */
const unsigned char idvalid[]=
{
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Nonzero if ch may appear inside an identifier (letter, digit or '_').
 * Digits are accepted here; the restriction on the first character of an
 * identifier is enforced by the dispatch in AFGetNextToken, not by this table.
 */
#define isValidForID(ch) (idvalid[(unsigned char)(ch)])
/*
 * Whitespace lookup table, indexed by byte value (256 entries, 16 per row;
 * positions assume ASCII). An entry is 1 for bytes 9..13 (tab, newline,
 * vertical tab, form feed, carriage return) and for 32 (space); every other
 * entry, including the one for NUL, is 0.
 */
const unsigned char spacevalid[]=
{
	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Nonzero if ch is one of the six whitespace bytes marked in spacevalid. */
#define isSpace(ch) (spacevalid[(unsigned char)(ch)])
/*
 * Advance-and-read primitive used by every lexer state: pre-increments the
 * index i, reads s[i], adds the offset o and stores the result in c.
 * The arguments are pasted without parentheses, and callers rely on that:
 * passing "(unsigned char)source" as s expands to
 * (unsigned char)source[++i], i.e. the cast applies to the element read,
 * not to the pointer. Passing o = -'0' turns a decimal digit into its value;
 * when c is an unsigned char any non-digit then wraps to a value >= 10.
 */
#define forwardRead(s,c,i,o) (c=s[++i]+o)

/*
 * Print a lexer diagnostic to stderr.
 *
 * count  - character position reported in the message.
 * symbol - offending byte; 0 is shown as EOL, printable ASCII (32..126) is
 *          shown quoted, anything else as a quoted two-digit hex escape.
 * reason - human-readable explanation appended to the message.
 */
static void throwError(int count,unsigned char symbol,const char *reason)
{
	/* Longest rendering is 7 characters plus the terminator. */
	char shown[16];
	if((symbol>=32)&&(symbol<=126))
	{
		/* Printable: show the character itself. */
		sprintf(shown,"'%c'",symbol);
	}
	else if(symbol)
	{
		/* Control or high-bit byte: show its hex code instead. */
		sprintf(shown,"'\\u%02x'",symbol);
	}
	else
	{
		/* NUL means the lexer ran off the end of the source. */
		sprintf(shown,"EOL");
	}
	fprintf(stderr,"Lexer: Error at char number %d: %s - %s.\n",count,shown,reason);
}

/*
 * Shared output buffer for decoded identifier and string-literal text.
 * It is (re)allocated by AFTokenizerReset; idLexer and stringLexer append
 * NUL-terminated text to it and tokens point into it.
 */
static char *text=NULL;
/* Index of the next free byte in text; reset to 0 by AFTokenizerReset. */
static int top=0;
/*
 * Lex a numeric literal with a goto-driven DFA.
 *
 * source - points at the first character of the literal.
 * start  - entry state chosen by the caller from that first character:
 *          0 = literal begins with '.', 1 = begins with '0',
 *          2 = begins with '1'..'9'. Any other value fails.
 * tok_p  - on success receives T_INT with value.i, or T_FLOAT with value.f.
 *
 * Returns the number of characters consumed (the index of the delimiter
 * that follows the literal), or -1 if the literal is malformed or is not
 * followed by one of the delimiters accepted in _DFAWrapup.
 *
 * Accepted forms: decimal integers, decimals with a fractional part
 * (at least one digit is required after the '.'), and after a leading '0'
 * the prefixes 'b' (binary), 'o' (octal) and 'x' (hexadecimal).
 * NOTE(review): integer accumulation in ivalue is unchecked, so a long
 * enough literal overflows a signed int; a prefix with no digits after it
 * (e.g. "0x") is accepted as 0.
 */
static int numLexer(const char *source,int start,struct _AFToken *tok_p)
{
	unsigned char ch;
	int count=0;
	/* type: 0 while the literal is an integer, 1 once a fraction digit is seen. */
	int type=0;
	int ivalue;
	double fvalue,mul;
	switch(start)
	{
	case 0:
		/* ".5" style: no integer part. */
		ivalue=0;
		goto _DFAState_2;
		break;
	case 1:
		/* Leading '0': may be a base prefix, a fraction, or plain decimal. */
		goto _DFAState_4;
		break;
	case 2:
		/* Leading non-zero digit; falls out of the switch into _DFAState_1. */
		ivalue=source[0]-'0';
		break;
	default:
		goto _DFAFailure;
	}
	/* State 1: accumulate decimal integer digits. */
_DFAState_1:
	forwardRead((unsigned char)source,ch,count,(-'0'));
	if(ch<10)
	{
		ivalue=ivalue*10+ch;
		goto _DFAState_1;
	}
	else if(ch!=(unsigned char)('.'-'0'))
	{
		goto _DFAWrapup;
	}
	/* A '.' was read: fall through to the first-fraction-digit state. */
	/* State 2: the character after '.' must be a digit. */
_DFAState_2:
	forwardRead((unsigned char)source,ch,count,(-'0'));
	if(ch<10)
	{
		type=1;
		fvalue=(double)ivalue+(double)ch*0.1;
		mul=0.1;
	}
	else
	{
		goto _DFAFailure;
	}
	/* State 3: remaining fraction digits, each scaled by a further 0.1. */
_DFAState_3:
	forwardRead((unsigned char)source,ch,count,(-'0'));
	if(ch<10)
	{
		mul=mul*0.1;
		fvalue+=mul*(double)ch;
		goto _DFAState_3;
	}
	else
	{
		goto _DFAWrapup;
	}
	/* State 4: character following a leading '0'. */
_DFAState_4:
	forwardRead(source,ch,count,0);
	switch(ch)
	{
	case '.':
		ivalue=0;
		goto _DFAState_2;
		break;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		/* A leading zero followed by digits is still read as decimal. */
		ivalue=ch-'0';
		goto _DFAState_1;
		break;
	case 'b':
		ivalue=0;
		goto _DFAState_7;
		break;
	case 'o':
		/* Octal: break leaves the switch and falls into _DFAState_5. */
		ivalue=0;
		break;
	case 'x':
		ivalue=0;
		goto _DFAState_6;
		break;
	default:
		/* A lone "0"; _DFAWrapup validates the character that follows. */
		ivalue=0;
		goto _DFAWrapup;
	}
	/* State 5: octal digits. */
_DFAState_5:
	forwardRead((unsigned char)source,ch,count,(-'0'));
	if(ch<8)
	{
		ivalue=ivalue*8+ch;
		goto _DFAState_5;
	}
	else
	{
		goto _DFAWrapup;
	}
	/* State 6: hexadecimal digits; the table yields value+1, hence the -1. */
_DFAState_6:
	forwardRead(source,ch,count,0);
	if((ch=isValidForHex(ch))>0)
	{
		ivalue=ivalue*16+ch-1;
		goto _DFAState_6;
	}
	else
	{
		goto _DFAWrapup;
	}
	/* State 7: binary digits. */
_DFAState_7:
	forwardRead((unsigned char)source,ch,count,(-'0'));
	if(ch<2)
	{
		ivalue=ivalue*2+ch;
		goto _DFAState_7;
	}
	else
	{
		goto _DFAWrapup;
	}
	/*
	 * Wrap-up: the literal is only accepted if the character that stopped
	 * the scan is end of input, whitespace, or one of the listed operator
	 * and separator characters. Anything else falls out of the switch into
	 * _DFAFailure.
	 */
_DFAWrapup:
	switch(source[count])
	{
	case 0:
	case '\t':
	case '\n':
	case '\v':
	case '\f':
	case '\r':
	case ' ':
	case '!':
	case '%':
	case '&':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case '-':
	case '/':
	case ':':
	case ';':
	case '<':
	case '=':
	case '>':
	case '?':
	case '[':
	case ']':
	case '^':
	case '|':
		if(type)
		{
			tok_p->type=T_FLOAT;
			tok_p->value.f=fvalue;
		}
		else
		{
			tok_p->type=T_INT;
			tok_p->value.i=ivalue;
		}
		return count;
		break;
	}
_DFAFailure:
	return -1;
}
/*
 * Copy one identifier from source into the shared text buffer.
 *
 * source - points at the identifier's first character, which is consumed
 *          unconditionally (the caller has already classified it).
 * hash_p - receives the hash accumulated over the identifier's characters
 *          via str_update_hash.
 *
 * The identifier extends while isValidForID() accepts the next character.
 * It is appended to text followed by a NUL terminator, and top is advanced
 * past both. Returns the identifier's length in characters.
 */
static int idLexer(const char *source,unsigned int *hash_p)
{
	unsigned int hash=0;
	int len=0;
	char cur=source[0];
	for(;;)
	{
		str_update_hash(hash,cur);
		text[top++]=cur;
		len++;
		cur=source[len];
		if(!isValidForID(cur))
		{
			break;
		}
	}
	/* Terminate the stored copy so it can be used as a C string. */
	text[top++]=0;
	*hash_p=hash;
	return len;
}
/*
 * Decode a quoted string literal into the shared text buffer.
 *
 * source   - points at the first character after the opening quote.
 * start_ch - the quote character that opened the literal; the same
 *            character closes it.
 *
 * The decoded bytes are appended to text followed by a NUL terminator and
 * top is advanced past them. Returns the index (relative to source) of the
 * closing quote, or -1 if the input ends before the literal is closed.
 *
 * Escapes: backslash followed by a backslash, single or double quote yields
 * that character; n, r and t yield newline, carriage return and tab; x
 * followed by two hex digits yields the byte with that value. Any other
 * escape is copied through verbatim, backslash included.
 *
 * An x escape that is NOT followed by two hex digits is copied through as
 * the two characters backslash and x, and scanning resumes with the
 * character right after the x. Nothing beyond the x is consumed, so a
 * closing quote or another escape directly after it is still recognised.
 * (Previously such characters were swallowed as literal text, and the
 * second one was stored as a NUL byte because the lookup result had
 * overwritten it.)
 */
static int stringLexer(const char *source,char start_ch)
{
	int count=0;
	for(;;)
	{
		char ch=source[count];
		if(!ch)
		{
			/* Ran off the end of the input before the closing quote. */
			return -1;
		}
		if(ch==start_ch)
		{
			text[top++]=0;
			return count;
		}
		count++;
		if(ch!='\\')
		{
			text[top++]=ch;
			continue;
		}
		/* ch was a backslash: source[count] selects the escape. */
		ch=source[count];
		switch(ch)
		{
		case 0:
			return -1;
		case '\\':
		case '\'':
		case '\"':
			text[top++]=ch;
			count++;
			break;
		case 'n':
			text[top++]='\n';
			count++;
			break;
		case 'r':
			text[top++]='\r';
			count++;
			break;
		case 't':
			text[top++]='\t';
			count++;
			break;
		case 'x':
			{
				/*
				 * Peek at the two characters after the x. The table maps
				 * NUL to 0, so the second lookup only happens when the
				 * first character exists and is a hex digit.
				 */
				unsigned char hi=isValidForHex(source[count+1]);
				unsigned char lo=hi?isValidForHex(source[count+2]):0;
				if(hi&&lo)
				{
					/* Table entries are value+1. */
					text[top++]=(char)((hi-1)*16+(lo-1));
					count+=3;
				}
				else
				{
					/* Not a valid hex escape: keep it verbatim. */
					text[top++]='\\';
					text[top++]='x';
					count++;
				}
			}
			break;
		default:
			/* Unknown escape: keep both characters verbatim. */
			text[top++]='\\';
			text[top++]=ch;
			count++;
			break;
		}
	}
}
/*
 * Skip a run of whitespace.
 *
 * source - points at a character the caller already knows is whitespace;
 *          scanning therefore starts at source[1].
 * x      - receives the first non-whitespace character found (possibly NUL).
 *
 * Returns the offset from source of that first non-whitespace character.
 */
static int skipOverSpaces(const char *source,char *x)
{
	const char *cursor=source+1;
	while(isSpace(*cursor))
	{
		cursor++;
	}
	*x=*cursor;
	return (int)(cursor-source);
}
/*
 * Find the end of a block comment.
 *
 * source - points at the '*' of the comment opener (the caller has already
 *          consumed the '/').
 *
 * Returns the offset from source of the first character after the closing
 * star-slash pair, or -1 if the input ends before the comment is closed.
 *
 * The search starts at source[0], so the opener's own '*' may also serve as
 * the closer's '*' (a slash, star, slash sequence counts as a complete
 * comment), as it did before. The earlier hand-written scan skipped the
 * character following any '*' that was not a '/', which made it miss the
 * terminator after an even run of stars; searching for the two-character
 * terminator directly has no such blind spot.
 */
static int skipOverComments(const char *source)
{
	const char *closer=strstr(source,"*/");
	if(closer==NULL)
	{
		return -1;
	}
	/* +2 steps over the terminator itself. */
	return (int)(closer-source)+2;
}
/*
 * Hash table from keyword spelling to a pointer into AFKeywordType.
 * Created and filled by AFTokenizerIni; queried by getIdWithKeywords.
 */
static struct hashlist *AFKeywords=NULL;
/*
 * Backing storage for the values stored in AFKeywords: the hash table keeps
 * the address of one of these elements and the lexer dereferences it to get
 * the keyword kind.
 * NOTE(review): AFTokenizerIni indexes this array with the K_* constants
 * themselves (AFKeywordType+K_like, ...), which is only correct if the
 * order below matches the numeric order of enum _AFKeywordType starting at
 * 0 - confirm against the header that declares the enum.
 */
static enum _AFKeywordType AFKeywordType[]=
{
	K_like,
	K_in,
	K_contains,
	K_rlike,
	K_irlike,
	K_if,
	K_then,
	K_else,
	K_end,
	K_true,
	K_false,
	K_null
};

/* Get next token: cursor state shared by AFGetNextToken and its helper macros. */
/* Source text being tokenized; set by AFTokenizerReset (pointer stored, not copied). */
static const char *AFSource=NULL;
/* Index into AFSource of the character currently held in AFCur. */
static int AFCount=0;
/* Current lookahead character, i.e. AFSource[AFCount]; 0 marks end of input. */
static char AFCur=0;
/*
 * Lex an identifier at the cursor and classify it as keyword or plain id.
 * Expands inside AFGetNextToken and uses its tok_p.
 * idLexer copies the identifier into text; if its spelling is found in
 * AFKeywords the token becomes T_KEYWORD carrying the keyword kind,
 * otherwise T_ID carrying a pointer to the copy, its length and its hash.
 * AFCount is left on the identifier's last character (len-1) because
 * AFGetNextToken advances by one more character after the switch.
 */
#define getIdWithKeywords() \
{\
	enum _AFKeywordType *key;\
	unsigned int hash;\
	int basecount=top;\
	int len=idLexer(AFSource+AFCount,&hash);\
	AFCount+=len-1;\
	if(str_hashquery_withhash(hash,AFKeywords,text+basecount,(void **)&key))\
	{\
		tok_p->type=T_KEYWORD;\
		tok_p->value.key=*key;\
	}\
	else\
	{\
		tok_p->type=T_ID;\
		tok_p->value.s.begin=text+basecount;\
		tok_p->value.s.offset=len;\
		tok_p->value.s.hash=hash;\
	}\
}
/*
 * Lex an identifier at the cursor and always emit it as T_ID, skipping the
 * keyword lookup. Expands inside AFGetNextToken and uses its tok_p.
 * The token carries a pointer to the copy in text, its length and its hash.
 * AFCount is left on the identifier's last character (len-1) because
 * AFGetNextToken advances by one more character after the switch.
 */
#define getIdWithoutKeywords() \
{\
	unsigned int hash;\
	int basecount=top;\
	int len=idLexer(AFSource+AFCount,&hash);\
	AFCount+=len-1;\
	tok_p->type=T_ID;\
	tok_p->value.s.begin=text+basecount;\
	tok_p->value.s.offset=len;\
	tok_p->value.s.hash=hash;\
}
/*
 * Fill the current token (tok_p of the enclosing function) as an operator
 * token of kind OP. Does not move the cursor.
 */
#define setTokenWithOP(OP) \
{\
	tok_p->type=T_OP;\
	tok_p->value.op=OP;\
}
/*
 * Lex a string literal whose opening quote is the current character AFCur.
 * Expands inside AFGetNextToken and uses its tok_p and return.
 * Steps past the opening quote, lets stringLexer decode the body into text,
 * and on success emits T_STRING pointing at the decoded copy; the stored
 * length excludes the NUL terminator (hence the -1). AFCount is left on the
 * closing quote. If the literal is not closed, reports the error and makes
 * the enclosing function return 1.
 */
#define setTokenWithString() \
{\
	AFCount++;\
	{\
		int basetop=top;\
		int len=stringLexer(AFSource+AFCount,AFCur);\
		if(len>=0)\
		{\
			AFCount+=len;\
			tok_p->type=T_STRING;\
			tok_p->value.s.begin=text+basetop;\
			tok_p->value.s.offset=top-basetop-1;\
		}\
		else\
		{\
			throwError(AFCount,AFCur,"unclosed string");\
			return 1;\
		}\
	}\
}
/*
 * Lex a numeric literal starting at the cursor.
 * Expands inside AFGetNextToken and uses its tok_p and return.
 * start is numLexer's entry state (0 for a leading '.', 1 for a leading
 * '0', 2 for a leading '1'..'9'). On success numLexer has filled tok_p and
 * AFCount is left on the literal's last character (len-1); on failure the
 * error is reported and the enclosing function returns 1.
 */
#define setTokenWithNum(start) \
{\
	int len=numLexer(AFSource+AFCount,start,tok_p);\
	if(len>0)\
	{\
		AFCount+=len-1;\
	}\
	else\
	{\
		throwError(AFCount,AFCur,"invalid character in number");\
		return 1;\
	}\
}
/*
 * Produce the next token from the source set by AFTokenizerReset.
 *
 * tok_p - receives the token. At end of input its type is T_NONE.
 *
 * Returns 0 on success (including end of input) and 1 after a lexical
 * error, which has already been reported through throwError.
 *
 * Cursor convention: on entry AFCur is the first unprocessed character.
 * Cases that end with "break" leave the cursor on the token's last
 * character and rely on the forwardRead after the switch to step past it.
 * Cases that had to read one character of lookahead which turned out not
 * to belong to the token "return 0" directly, because the cursor is
 * already on the next unprocessed character.
 */
int AFGetNextToken(struct _AFToken *tok_p)
{
_Tokenizer_Start:
	switch(AFCur)
	{
		/* EOF here */
	case 0:
		tok_p->type=T_NONE;
		return 0;
		break;

		/* spaces here */
	case '\t':
	case '\n':
	case '\v':
	case '\f':
	case '\r':
	case ' ':
		/* skipOverSpaces also reloads AFCur, so restart the dispatch. */
		AFCount+=skipOverSpaces(AFSource+AFCount,&AFCur);
		goto _Tokenizer_Start;
		break;

		/* separators & operators here */
	case '!':
		/* "!" is O_NOT, "!=" is O_INEQ, "!==" is O_SINEQ. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='=')
		{
			forwardRead(AFSource,AFCur,AFCount,0);
			if(AFCur=='=')
			{
				setTokenWithOP(O_SINEQ);
			}
			else
			{
				setTokenWithOP(O_INEQ);
				return 0;
			}
		}
		else
		{
			setTokenWithOP(O_NOT);
			return 0;
		}
		break;
	case '%':
		setTokenWithOP(O_REM);
		break;
	case '&':
		setTokenWithOP(O_AND);
		break;
	case '(':
		tok_p->type=T_BRA;
		break;
	case ')':
		tok_p->type=T_KET;
		break;
	case '*':
		/* "*" is O_MUL, "**" is O_EXP. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='*')
		{
			setTokenWithOP(O_EXP);
		}
		else
		{
			setTokenWithOP(O_MUL);
			return 0;
		}
		break;
	case '+':
		setTokenWithOP(O_PLUS);
		break;
	case ',':
		tok_p->type=T_COMMA;
		break;
	case '-':
		setTokenWithOP(O_MINUS);
		break;
	case '/':
		/* A slash followed by '*' opens a block comment; otherwise O_DIV. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='*')
		{
			int len;
			len=skipOverComments(AFSource+AFCount);
			if(len>=0)
			{
				/* Jump past the comment and tokenize whatever follows. */
				AFCount+=len;
				AFCur=AFSource[AFCount];
				goto _Tokenizer_Start;
			}
			else
			{
				throwError(AFCount,AFCur,"unclosed comments");
				return 1;
			}
		}
		else
		{
			setTokenWithOP(O_DIV);
		}
		return 0;
		break;
	case ':':
		/* ":" is O_TER_S, ":=" is O_SET. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='=')
		{
			setTokenWithOP(O_SET);
		}
		else
		{
			setTokenWithOP(O_TER_S);
			return 0;
		}
		break;
	case ';':
		tok_p->type=T_STATEMENT_SEPARATOR;
		break;
	case '<':
		/* "<" is O_L, "<=" is O_LE. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='=')
		{
			setTokenWithOP(O_LE);
		}
		else
		{
			setTokenWithOP(O_L);
			return 0;
		}
		break;
	case '=':
		/* "=" and "==" both yield O_EQ; "===" yields O_SEQ. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='=')
		{
			forwardRead(AFSource,AFCur,AFCount,0);
			if(AFCur=='=')
			{
				setTokenWithOP(O_SEQ);
			}
			else
			{
				setTokenWithOP(O_EQ);
				return 0;
			}
		}
		else
		{
			setTokenWithOP(O_EQ);
			return 0;
		}
		break;
	case '>':
		/* ">" is O_G, ">=" is O_GE. */
		forwardRead(AFSource,AFCur,AFCount,0);
		if(AFCur=='=')
		{
			setTokenWithOP(O_GE);
		}
		else
		{
			setTokenWithOP(O_G);
			return 0;
		}
		break;
	case '?':
		setTokenWithOP(O_TER_Q);
		break;
	case '[':
		tok_p->type=T_SQUARE_BRA;
		break;
	case ']':
		tok_p->type=T_SQUARE_KET;
		break;
	case '^':
		setTokenWithOP(O_XOR);
		break;
	case '|':
		setTokenWithOP(O_OR);
		break;

		/* strings here */
	case '\"':
	case '\'':
		setTokenWithString();
		break;

		/* numbers here */
	case '.':
		setTokenWithNum(0);
		break;
	case '0':
		setTokenWithNum(1);
		break;
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		setTokenWithNum(2);
		break;

		/* ids & tokens here */
		/*
		 * None of the keywords registered by AFTokenizerIni starts with an
		 * upper-case letter, '_', 'a' or 'b', so identifiers beginning with
		 * those characters skip the keyword lookup.
		 */
	case 'A':
	case 'B':
	case 'C':
	case 'D':
	case 'E':
	case 'F':
	case 'G':
	case 'H':
	case 'I':
	case 'J':
	case 'K':
	case 'L':
	case 'M':
	case 'N':
	case 'O':
	case 'P':
	case 'Q':
	case 'R':
	case 'S':
	case 'T':
	case 'U':
	case 'V':
	case 'W':
	case 'X':
	case 'Y':
	case 'Z':
	case '_':
	case 'a':
	case 'b':
		getIdWithoutKeywords();
		break;
	case 'c':
	case 'd':
	case 'e':
	case 'f':
	case 'g':
	case 'h':
	case 'i':
	case 'j':
	case 'k':
	case 'l':
	case 'm':
	case 'n':
	case 'o':
	case 'p':
	case 'q':
	case 'r':
	case 's':
	case 't':
	case 'u':
	case 'v':
	case 'w':
	case 'x':
	case 'y':
	case 'z':
		getIdWithKeywords();
		break;
	default:
		throwError(AFCount,AFCur,"invalid character");
		return 1;
		break;
	}
	/* Step past the last character of the token just produced. */
	forwardRead(AFSource,AFCur,AFCount,0);
	return 0;
}
void AFTokenizerIni()
{
	AFKeywords=hashini();
	str_hashadd(AFKeywords,"like",AFKeywordType+K_like);
	str_hashadd(AFKeywords,"matches",AFKeywordType+K_like);
	str_hashadd(AFKeywords,"in",AFKeywordType+K_in);
	str_hashadd(AFKeywords,"contains",AFKeywordType+K_contains);
	str_hashadd(AFKeywords,"rlike",AFKeywordType+K_rlike);
	str_hashadd(AFKeywords,"regex",AFKeywordType+K_rlike);
	str_hashadd(AFKeywords,"irlike",AFKeywordType+K_irlike);
	str_hashadd(AFKeywords,"if",AFKeywordType+K_if);
	str_hashadd(AFKeywords,"then",AFKeywordType+K_then);
	str_hashadd(AFKeywords,"else",AFKeywordType+K_else);
	str_hashadd(AFKeywords,"end",AFKeywordType+K_end);
	str_hashadd(AFKeywords,"true",AFKeywordType+K_true);
	str_hashadd(AFKeywords,"false",AFKeywordType+K_false);
	str_hashadd(AFKeywords,"null",AFKeywordType+K_null);
	return;
}
/*
 * Point the tokenizer at a new source string and rewind all lexer state.
 *
 * source     - NUL-terminated text to tokenize. Only the pointer is stored,
 *              so the text must stay valid while tokens are being read.
 * source_len - length of source as supplied by the caller.
 *
 * The shared text buffer is grown when source_len exceeds the largest
 * length seen so far; it is never shrunk. It is sized to source_len+1
 * bytes: every identifier is stored with a trailing NUL, and the last
 * identifier in the input needs no separator character after it, so the
 * stored text can be one byte longer than the characters in source. The
 * extra byte keeps the buffer large enough even if source_len does not
 * count the terminating NUL.
 */
void AFTokenizerReset(const char *source,unsigned int source_len)
{
	/* Largest source_len the current text buffer was allocated for. */
	static unsigned int text_limit=0;
	/* Reset text. Text needs at most one byte more than the source code. */
	if(text_limit<source_len)
	{
		s_free(text);
		/* Widen before adding 1 so the size cannot wrap in unsigned int. */
		text=(char *)s_malloc(sizeof(char)*((size_t)source_len+1));
		text_limit=source_len;
	}
	top=0;
	/* Reset the pointer to the source code. */
	AFSource=source;
	AFCount=0;
	AFCur=*source;
	return ;
}