#ifdef _MSC_VER #pragma warning (disable:4244) #pragma warning (disable:4267) #pragma warning (disable:4102) #endif #include "nasal_lexer.h" #include "repl/repl.h" #include "util/util.h" #include "util/fs.h" namespace nasal { bool lexer::skip(char c) { return c==' ' || c=='\n' || c=='\t' || c=='\r' || c==0; } bool lexer::is_id(char c) { return (c=='_') || std::isalpha(c) || (c<0); } bool lexer::is_hex(char c) { return std::isxdigit(c); } bool lexer::is_oct(char c) { return '0'<=c && c<='7'; } bool lexer::is_dec(char c) { return std::isdigit(c); } bool lexer::is_str(char c) { return c=='\'' || c=='\"' || c=='`'; } bool lexer::is_quesmark(char c) { return c=='?'; } bool lexer::is_single_opr(char c) { return ( c=='(' || c==')' || c=='[' || c==']' || c=='{' || c=='}' || c==',' || c==';' || c==':' || c=='`' || c=='@' || c=='%' || c=='$' || c=='\\' ); } bool lexer::is_calc_opr(char c) { return ( c=='=' || c=='+' || c=='-' || c=='*' || c=='!' || c=='/' || c=='<' || c=='>' || c=='~' || c=='|' || c=='&' || c=='^' ); } void lexer::skip_note() { // avoid note, after this process ptr will point to '\n' // so next loop line counter+1 while(++ptr<res.size() && res[ptr]!='\n') {} } void lexer::err_char() { ++column; char c = res[ptr++]; err.err("lexer", {line, column-1, line, column, filename}, "invalid character 0x" + util::char_to_hex(c) ); ++invalid_char; } void lexer::open(const std::string& file) { if (repl::info::instance()->in_repl_mode && repl::info::instance()->repl_file_name==file) { err.load(file); filename = file; res = repl::info::instance()->repl_file_source; return; } if (file.empty()) { err.err("lexer", "empty input file"); err.chkerr(); } // check file exsits and it is a regular file if (!fs::is_regular(file)) { err.err("lexer", "<"+file+"> is not a regular file"); err.chkerr(); } // load filename = file; std::ifstream in(file, std::ios::binary); if (in.fail()) { err.err("lexer", "failed to open <" + file + ">"); res = ""; return; } err.load(file); std::stringstream ss; ss << in.rdbuf(); res = ss.str(); } tok lexer::get_type(const std::string& str) { // search token type from mapper // if cannot find, just return null return token_mapper.count(str)? token_mapper.at(str):tok::tk_null; } std::string lexer::utf8_gen() { std::string str = ""; while(ptr<res.size() && res[ptr]<0) { std::string tmp = ""; u32 nbytes = util::utf8_hdchk(res[ptr]); if (!nbytes) { ++ptr; ++column; continue; } tmp += res[ptr++]; for(u32 i = 0; i<nbytes; ++i, ++ptr) { if (ptr<res.size() && (res[ptr]&0xc0)==0x80) { tmp += res[ptr]; } } // utf8 character's total length is 1+nbytes if (tmp.length()!=1+nbytes) { ++column; std::string utf_info = "0x" + util::char_to_hex(tmp[0]); for(u32 i = 1; i<tmp.size(); ++i) { utf_info += " 0x" + util::char_to_hex(tmp[i]); } err.err("lexer", {line, column-1, line, column, filename}, "invalid utf-8 <"+utf_info+">" ); ++invalid_char; } str += tmp; // may have some problems because not all the unicode takes 2 space column += 2; } return str; } token lexer::id_gen() { u64 begin_line = line; u64 begin_column = column; std::string str = ""; while(ptr<res.size() && (is_id(res[ptr]) || is_dec(res[ptr]))) { if (res[ptr]<0) { // utf-8 str += utf8_gen(); } else { // ascii str += res[ptr++]; ++column; } } tok type = get_type(str); return { {begin_line, begin_column, line, column, filename}, (type!=tok::tk_null)? type:tok::tk_id, str }; } token lexer::num_gen() { u64 begin_line = line; u64 begin_column = column; // generate hex number if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='x') { std::string str = "0x"; ptr += 2; while(ptr<res.size() && is_hex(res[ptr])) { str += res[ptr++]; } column += str.length(); // "0x" if (str.length()<3) { err.err("lexer", {begin_line, begin_column, line, column, filename}, "invalid number `"+str+"`" ); } return { {begin_line, begin_column, line, column, filename}, tok::tk_num, str }; } else if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='o') { // generate oct number std::string str = "0o"; ptr += 2; while(ptr<res.size() && is_oct(res[ptr])) { str += res[ptr++]; } bool erfmt = false; while(ptr<res.size() && (is_dec(res[ptr]) || is_hex(res[ptr]))) { erfmt = true; str += res[ptr++]; } column += str.length(); if (str.length()==2 || erfmt) { err.err("lexer", {begin_line, begin_column, line, column, filename}, "invalid number `"+str+"`" ); } return { {begin_line, begin_column, line, column, filename}, tok::tk_num, str }; } // generate dec number // dec number -> [0~9][0~9]*(.[0~9]*)(e|E(+|-)0|[1~9][0~9]*) std::string str = ""; while(ptr<res.size() && is_dec(res[ptr])) { str += res[ptr++]; } if (ptr<res.size() && res[ptr]=='.') { str += res[ptr++]; while(ptr<res.size() && is_dec(res[ptr])) { str += res[ptr++]; } // "xxxx." is not a correct number if (str.back()=='.') { column += str.length(); err.err("lexer", {begin_line, begin_column, line, column, filename}, "invalid number `"+str+"`" ); return { {begin_line, begin_column, line, column, filename}, tok::tk_num, "0" }; } } if (ptr<res.size() && (res[ptr]=='e' || res[ptr]=='E')) { str += res[ptr++]; if (ptr<res.size() && (res[ptr]=='-' || res[ptr]=='+')) { str += res[ptr++]; } while(ptr<res.size() && is_dec(res[ptr])) { str += res[ptr++]; } // "xxxe(-|+)" is not a correct number if (str.back()=='e' || str.back()=='E' || str.back()=='-' || str.back()=='+') { column += str.length(); err.err("lexer", {begin_line, begin_column, line, column, filename}, "invalid number `"+str+"`" ); return { {begin_line, begin_column, line, column, filename}, tok::tk_num, "0" }; } } column += str.length(); return { {begin_line, begin_column, line, column, filename}, tok::tk_num, str }; } token lexer::str_gen() { u64 begin_line = line; u64 begin_column = column; std::string str = ""; const char begin = res[ptr]; ++column; while(++ptr<res.size() && res[ptr]!=begin) { ++column; if (res[ptr]=='\n') { column = 0; ++line; } if (res[ptr]=='\\' && ptr+1<res.size()) { ++column; ++ptr; switch(res[ptr]) { case '0': str += '\0'; break; case 'a': str += '\a'; break; case 'b': str += '\b'; break; case 'e': str += '\033'; break; case 't': str += '\t'; break; case 'n': str += '\n'; break; case 'v': str += '\v'; break; case 'f': str += '\f'; break; case 'r': str += '\r'; break; case '?': str += '\?'; break; case '\\':str += '\\'; break; case '\'':str += '\''; break; case '\"':str += '\"'; break; default: str += res[ptr];break; } if (res[ptr]=='\n') { column = 0; ++line; } continue; } str += res[ptr]; } // check if this string ends with a " or ' if (ptr++>=res.size()) { err.err("lexer", {begin_line, begin_column, line, column, filename}, "get EOF when generating string" ); return { {begin_line, begin_column, line, column, filename}, tok::tk_str, str }; } ++column; // if is not utf8, 1+utf8_hdchk should be 1 if (begin=='`' && str.length()!=1+util::utf8_hdchk(str[0])) { err.err("lexer", {begin_line, begin_column, line, column, filename}, "\'`\' is used for string including one character" ); } return { {begin_line, begin_column, line, column, filename}, tok::tk_str, str }; } token lexer::quesmark_gen() { u64 begin_line = line; u64 begin_column = column; std::string str(1, res[ptr]); ++column; ++ptr; if (ptr < res.size() && (res[ptr]=='?' || res[ptr]=='.')) { str += res[ptr]; ++column; ++ptr; } return { {begin_line, begin_column, line, column, filename}, get_type(str), str }; } token lexer::single_opr() { u64 begin_line = line; u64 begin_column = column; std::string str(1, res[ptr]); ++column; tok type = get_type(str); if (type==tok::tk_null) { err.err("lexer", {begin_line, begin_column, line, column, filename}, "invalid operator `"+str+"`" ); } ++ptr; return {{begin_line, begin_column, line, column, filename}, type, str}; } token lexer::dots() { u64 begin_line = line; u64 begin_column = column; std::string str = "."; if (ptr+2<res.size() && res[ptr+1]=='.' && res[ptr+2]=='.') { str += ".."; } ptr += str.length(); column += str.length(); return {{begin_line, begin_column, line, column, filename}, get_type(str), str}; } token lexer::calc_opr() { u64 begin_line = line; u64 begin_column = column; // get calculation operator std::string str(1, res[ptr++]); if (ptr<res.size() && res[ptr]=='=') { str += res[ptr++]; } column += str.length(); return {{begin_line, begin_column, line, column, filename}, get_type(str), str}; } const error& lexer::scan(const std::string& file) { line = 1; column = 0; ptr = 0; toks = {}; open(file); while(ptr<res.size()) { while(ptr<res.size() && skip(res[ptr])) { // these characters will be ignored, and '\n' will cause ++line ++column; if (res[ptr++]=='\n') { ++line; column = 0; } } if (ptr>=res.size()) { break; } if (is_id(res[ptr])) { toks.push_back(id_gen()); } else if (is_dec(res[ptr])) { toks.push_back(num_gen()); } else if (is_str(res[ptr])) { toks.push_back(str_gen()); } else if (is_quesmark(res[ptr])) { toks.push_back(quesmark_gen()); } else if (is_single_opr(res[ptr])) { toks.push_back(single_opr()); } else if (res[ptr]=='.') { toks.push_back(dots()); } else if (is_calc_opr(res[ptr])) { toks.push_back(calc_opr()); } else if (res[ptr]=='#') { skip_note(); } else { err_char(); } if (invalid_char>10) { err.err("lexer", "too many invalid characters, stop"); break; } } if (toks.size()) { // eof token's location is the last token's location toks.push_back({toks.back().loc, tok::tk_eof, "<eof>"}); } else { // if token sequence is empty, generate a default location toks.push_back({ {line, column, line, column, filename}, tok::tk_eof, "<eof>" }); } res = ""; return err; } }