This one works OK for now:
Code:
'Tokenizer for BASIC-like syntax (the original version was written in Python)
'PyQB tokenizer translation to Oxygen Basic - by Aurel 2018
'NOTE(review): #lookahead presumably lets tokenizer call isKeyword before its definition - confirm
#lookahead
'Reserved words; isKeyword compares upper-cased input against these 14 entries
string KEYWORDS[] = {"CLS","PRINT","IF","ELSE","FOR","TO","NEXT","ENDIF","WHILE","WEND","UNTIL","DO","LOOP","THEN"}
'Characters emitted as one-character SYMBOL tokens
string SYMBOLS = ":=()+-*/<>"
'Characters accepted inside identifiers and keywords (letters plus the $ and # characters)
string ALPHABETS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$#"
'Characters that may start a NUMBER token
string NUMBERS = "0123456789"
'Characters that may continue a NUMBER token once it has started
string NUMBERS_WITH_DECIMALPOINT = NUMBERS + "."
'The two sets below are not referenced by the code visible in this file
string ALPHANUMBERS = ALPHABETS + NUMBERS
string ALPHANUMBERS_WITH_UNDERSCORE = ALPHANUMBERS + "_"
'Output buffer that tokenizer appends to and returns
string tokens 'token buffer 'tokens[1024] ' token list
'Line terminator (CR + LF) placed after every entry in the token buffer
string crlf = chr(13)+chr(10)
'Split BASIC-like source text into classified tokens.
'  code    : source text to scan
'  returns : the file-level `tokens` buffer after appending one
'            "<text> : <KIND>" + crlf entry per token, where KIND is
'            KEYWORD, IDENTIFIER, SYMBOL, NUMBER or UNKNOWN
'Every pass through the main loop consumes at least one character, so the
'scan always terminates, even on characters the tokenizer does not recognise.
'NOTE(review): `tokens` is never cleared here, so repeated calls keep
'appending to the same buffer - confirm that this is intended.
function tokenizer(string code) as string
  string token, ch, blanks
  int i, total
  blanks = " " + chr(9) + chr(13) + chr(10) 'space, tab, CR, LF are skipped silently
  total = len(code)
  i = 1
  while i <= total
    ch = mid(code, i, 1)
    if instr(ALPHABETS, ch) > 0 'identifier or keyword
      token = ""
      while i <= total and instr(ALPHABETS, mid(code, i, 1)) > 0
        token = token + mid(code, i, 1)
        i = i + 1
      wend
      'isKeyword returns "" when the word is not in the keyword list
      if len(isKeyword(token)) > 0
        tokens = tokens + token + " : KEYWORD" + crlf
      else
        tokens = tokens + token + " : IDENTIFIER" + crlf
      end if
    elseif instr(SYMBOLS, ch) > 0 'single-character operator / punctuation
      tokens = tokens + ch + " : SYMBOL" + crlf
      i = i + 1
    elseif instr(NUMBERS, ch) > 0 'numeric literal: starts with a digit, may contain "."
      token = ""
      while i <= total and instr(NUMBERS_WITH_DECIMALPOINT, mid(code, i, 1)) > 0
        token = token + mid(code, i, 1)
        i = i + 1
      wend
      tokens = tokens + token + " : NUMBER" + crlf
    elseif instr(blanks, ch) > 0 'whitespace produces no token
      i = i + 1
    else
      'Unrecognised character (quote, underscore, stray "." ...): report it and
      'step past it. Without this branch the loop would never advance.
      tokens = tokens + ch + " : UNKNOWN" + crlf
      i = i + 1
    end if
  wend
  return tokens
end function
'...........................................................
'Case-insensitive lookup of a word in the KEYWORDS table.
'  tok     : candidate word
'  returns : the matching KEYWORDS entry, or "" when no entry matches
function isKeyword(byval tok as string) as string
  string wanted
  int slot
  wanted = ucase(tok) 'KEYWORDS entries are upper case
  slot = 1
  while slot <= 14 '14 = number of KEYWORDS entries scanned
    if wanted = KEYWORDS[slot]
      return KEYWORDS[slot]
    end if
    slot = slot + 1
  wend
  return ""
end function
'.............................................................
'Demo: run the tokenizer over one sample line and display the token list
string input = "For n= 10 To 100 :a= a*0.35 : Next n "
string tokenList = tokenizer(input)
print tokenList