Comma: lib/parser/Lexer.cpp Source File

00001 //===-- parser/Lexer.cpp -------------------------------------- -*- C++ -*-===//
00002 //
00003 // This file is distributed under the MIT license.  See LICENSE.txt for details.
00004 //
00005 // Copyright (C) 2008-2010, Stephen Wilson
00006 //
00007 //===----------------------------------------------------------------------===//
00008 
00009 #include "comma/parser/Lexer.h"
00010 #include <cstring>
00011 
00012 using namespace comma;
00013 
00014 Lexer::Lexer(TextProvider &txtProvider, Diagnostic &diag)
00015     : txtProvider(txtProvider),
00016       diagnostic(diag),
00017       currentIter(txtProvider.begin()),
00018       errorCount(0),
00019       scanningAborted(false),
00020       index(0)
00021 { }
00022 
00023 std::string Lexer::Token::getString() const
00024 {
00025     return Lexer::tokenString(*this);
00026 }
00027 
00028 const char *Lexer::tokenString(const Code code)
00029 {
00030     const char *result;
00031 
00032     switch (code) {
00033     default:
00034         result = 0;
00035         break;
00036 
00037 #define RESERVED(NAME, STRING) case TKN_ ## NAME: result = STRING; break;
00038 #define GLYPH(NAME, STRING)    case TKN_ ## NAME: result = STRING; break;
00039 #include "comma/parser/Tokens.def"
00040 #undef RESERVED
00041 #undef GLYPH
00042     }
00043 
00044     return result;
00045 }
00046 
00047 std::string Lexer::tokenString(const Token &token)
00048 {
00049     Code code = token.getCode();
00050 
00051     switch (code) {
00052     default:
00053         return std::string(tokenString(code));
00054         break;
00055 
00056     case TKN_IDENTIFIER:
00057     case TKN_INTEGER:
00058     case TKN_STRING:
00059     case TKN_CHARACTER:
00060         return std::string(token.getRep(), token.getLength());
00061     }
00062 }
00063 
00064 bool Lexer::isDecimalDigit(unsigned c)
00065 {
00066     return ('0' <= c && c <= '9');
00067 }
00068 
00069 bool Lexer::isInitialIdentifierChar(unsigned c)
00070 {
00071     if (('a' <= c && c <= 'z') ||
00072         ('A' <= c && c <= 'Z') ||
00073         (c == '%') || (c == '_'))
00074         return true;
00075 
00076     return false;
00077 }
00078 
00079 bool Lexer::isInnerIdentifierChar(unsigned c)
00080 {
00081     return isInitialIdentifierChar(c) || isDecimalDigit(c) || c == '?';
00082 }
00083 
00084 bool Lexer::isWhitespace(unsigned c)
00085 {
00086     return (c == ' ') || (c == '\t') || (c == '\n');
00087 }
00088 
00089 Location Lexer::currentLocation() const
00090 {
00091     return txtProvider.getLocation(currentIter);
00092 }
00093 
00094 // Something of a fundamental function, since all characters are gathered from
00095 // the underlying stream via this routine.
00096 unsigned Lexer::readStream()
00097 {
00098     unsigned c = *currentIter;
00099     ++currentIter;
00100 
00101     // Ensure that carriage returns and DOS style newline sequences are
00102     // canonicalized into single newline character codes.
00103     switch (c) {
00104 
00105     case '\r':
00106         if (*currentIter == '\n')
00107             ++currentIter;
00108     case '\n':
00109         return '\n';
00110     }
00111 
00112     return c;
00113 }
00114 
00115 unsigned Lexer::peekStream()
00116 {
00117     unsigned c = *currentIter;
00118 
00119     if (c == '\r')
00120         return '\n';
00121 
00122     return c;
00123 }
00124 
00125 void Lexer::ungetStream()
00126 {
00127     --currentIter;
00128 }
00129 
00130 void Lexer::ignoreStream()
00131 {
00132     readStream();
00133 }
00134 
00135 bool Lexer::eatComment()
00136 {
00137     unsigned c = peekStream();
00138 
00139     if (c == '-') {
00140         ignoreStream();
00141         if (peekStream() == '-') {
00142             // Loop until either a newline or the input stream is
00143             // exhausted.
00144             for (;;) {
00145                 c = readStream();
00146                 if (c == '\n' || c == 0)
00147                     return true;
00148             }
00149         }
00150         else {
00151             ungetStream();
00152             return false;
00153         }
00154     }
00155     return false;
00156 }
00157 
00158 bool Lexer::eatWhitespace()
00159 {
00160     unsigned c = peekStream();
00161 
00162     if (isWhitespace(c)) {
00163         do {
00164             ignoreStream();
00165         } while (isWhitespace(c = peekStream()));
00166         return true;
00167     }
00168     return false;
00169 }
00170 
00171 void Lexer::emitToken(Code code,
00172                       const TextIterator &start, const TextIterator &end)
00173 {
00174     Location    loc    = txtProvider.getLocation(start);
00175     const char *string = &start;
00176     unsigned    length = &end - &start;
00177     *targetToken = Token(code, loc, string, length);
00178 }
00179 
00180 void Lexer::emitToken(Code code, Location loc)
00181 {
00182     *targetToken = Token(code, loc, 0, 0);
00183 }
00184 
00185 void Lexer::emitStringToken(const TextIterator &start, const TextIterator &end)
00186 {
00187     emitToken(TKN_STRING, start, end);
00188 }
00189 
00190 void Lexer::emitIntegerToken(const TextIterator &start, const TextIterator &end)
00191 {
00192     emitToken(TKN_INTEGER, start, end);
00193 }
00194 
00195 void Lexer::emitIdentifierToken(const TextIterator &start, const TextIterator &end)
00196 {
00197     emitToken(TKN_IDENTIFIER, start, end);
00198 }
00199 
00200 void Lexer::emitCharacterToken(const TextIterator &start, const TextIterator &end)
00201 {
00202     emitToken(TKN_CHARACTER, start, end);
00203 }
00204 
00205 Lexer::Code Lexer::getTokenCode(TextIterator &start, TextIterator &end) const
00206 {
00207     Code code = UNUSED_ID;
00208     const char *str = &start;
00209     unsigned length = &end - &start;
00210 
00211     switch (length) {
00212     case 1:
00213         if (strncmp(str, "%", length) == 0)
00214             code = TKN_PERCENT;
00215         break;
00216 
00217     case 2:
00218         if (strncmp(str, "is", length) == 0)
00219             code = TKN_IS;
00220         else if (strncmp(str, "if", length) == 0)
00221             code = TKN_IF;
00222         else if (strncmp(str, "in", length) == 0)
00223             code = TKN_IN;
00224         else if (strncmp(str, "of", length) == 0)
00225             code = TKN_OF;
00226         else if (strncmp(str, "or", length) == 0)
00227             code = TKN_OR;
00228         break;
00229 
00230     case 3:
00231         if (strncmp(str, "end", length) == 0)
00232             code = TKN_END;
00233         else if (strncmp(str, "out", length) == 0)
00234             code = TKN_OUT;
00235         else if (strncmp(str, "add", length) == 0)
00236             code = TKN_ADD;
00237         else if (strncmp(str, "inj", length) == 0)
00238             code = TKN_INJ;
00239         else if (strncmp(str, "prj", length) == 0)
00240             code = TKN_PRJ;
00241         else if (strncmp(str, "and", length) == 0)
00242             code = TKN_AND;
00243         else if (strncmp(str, "mod", length) == 0)
00244             code = TKN_MOD;
00245         else if (strncmp(str, "rem", length) == 0)
00246             code = TKN_REM;
00247         else if (strncmp(str, "for", length) == 0)
00248             code = TKN_FOR;
00249         else if (strncmp(str, "not", length) == 0)
00250             code = TKN_NOT;
00251         else if (strncmp(str, "xor", length) == 0)
00252             code = TKN_XOR;
00253         else if (strncmp(str, "new", length) == 0)
00254             code = TKN_NEW;
00255         else if (strncmp(str, "all", length) == 0)
00256             code = TKN_ALL;
00257         break;
00258 
00259     case 4:
00260         if (strncmp(str, "else", length) == 0)
00261             code = TKN_ELSE;
00262         else if (strncmp(str, "loop", length) == 0)
00263             code = TKN_LOOP;
00264         else if (strncmp(str, "then", length) == 0)
00265             code = TKN_THEN;
00266         else if (strncmp(str, "with", length) == 0)
00267             code = TKN_WITH;
00268         else if (strncmp(str, "type", length) == 0)
00269             code = TKN_TYPE;
00270         else if (strncmp(str, "when", length) == 0)
00271             code = TKN_WHEN;
00272         else if (strncmp(str, "null", length) == 0)
00273             code = TKN_NULL;
00274         break;
00275 
00276     case 5:
00277         if (strncmp(str, "begin", length) == 0)
00278             code = TKN_BEGIN;
00279         else if (strncmp(str, "elsif", length) == 0)
00280             code = TKN_ELSIF;
00281         else if (strncmp(str, "while", length) == 0)
00282             code = TKN_WHILE;
00283         else if (strncmp(str, "range", length) == 0)
00284             code = TKN_RANGE;
00285         else if (strncmp(str, "array", length) == 0)
00286             code = TKN_ARRAY;
00287         else if (strncmp(str, "raise", length) == 0)
00288             code = TKN_RAISE;
00289         break;
00290 
00291     case 6:
00292         if (strncmp(str, "domain", length) == 0)
00293             code = TKN_DOMAIN;
00294         else if (strncmp(str, "return", length) == 0)
00295             code = TKN_RETURN;
00296         else if (strncmp(str, "import", length) == 0)
00297             code = TKN_IMPORT;
00298         else if (strncmp(str, "pragma", length) == 0)
00299             code = TKN_PRAGMA;
00300         else if (strncmp(str, "others", length) == 0)
00301             code = TKN_OTHERS;
00302         else if (strncmp(str, "record", length) == 0)
00303             code = TKN_RECORD;
00304         else if (strncmp(str, "access", length) == 0)
00305             code = TKN_ACCESS;
00306         break;
00307 
00308     case 7:
00309         if (strncmp(str, "carrier", length) == 0)
00310             code = TKN_CARRIER;
00311         else if (strncmp(str, "declare", length) == 0)
00312             code = TKN_DECLARE;
00313         else if (strncmp(str, "generic", length) == 0)
00314             code = TKN_GENERIC;
00315         else if (strncmp(str, "subtype", length) == 0)
00316             code = TKN_SUBTYPE;
00317         else if (strncmp(str, "reverse", length) == 0)
00318             code = TKN_REVERSE;
00319         else if (strncmp(str, "renames", length) == 0)
00320             code = TKN_RENAMES;
00321         break;
00322 
00323     case 8:
00324         if (strncmp(str, "function", length) == 0)
00325             code = TKN_FUNCTION;
00326         else if (strncmp(str, "abstract", length) == 0)
00327             code = TKN_ABSTRACT;
00328         break;
00329 
00330     case 9:
00331         if (strncmp(str, "procedure", length) == 0)
00332             code = TKN_PROCEDURE;
00333         else if (strncmp(str, "signature", length) == 0)
00334             code = TKN_SIGNATURE;
00335         else if (strncmp(str, "exception", length) == 0)
00336             code = TKN_EXCEPTION;
00337         break;
00338     }
00339     return code;
00340 }
00341 
00342 void Lexer::diagnoseConsecutiveUnderscores(unsigned c1, unsigned c2)
00343 {
00344     if (c1 == '_' && c2 == '_') {
00345         report(diag::CONSECUTIVE_UNDERSCORE);
00346         do {
00347             ignoreStream();
00348         } while ((c2 = peekStream()) == '_');
00349     }
00350 }
00351 
00352 bool Lexer::scanWord()
00353 {
00354     TextIterator start = currentIter;
00355     unsigned c1, c2;
00356 
00357     if (isInitialIdentifierChar(c1 = peekStream())) {
00358         do {
00359             ignoreStream();
00360             c2 = peekStream();
00361             diagnoseConsecutiveUnderscores(c1, c2);
00362         } while (isInnerIdentifierChar(c1 = c2));
00363 
00364         Code code = getTokenCode(start, currentIter);
00365 
00366         if (code == UNUSED_ID)
00367             emitIdentifierToken(start, currentIter);
00368         else
00369             emitToken(code, txtProvider.getLocation(start));
00370         return true;
00371     }
00372     return false;
00373 }
00374 
00375 bool Lexer::scanGlyph()
00376 {
00377     Location loc = currentLocation();
00378     unsigned c = readStream();
00379     Code code  = UNUSED_ID;
00380 
00381     switch (c) {
00382     case '(':
00383         code = TKN_LPAREN;
00384         break;
00385 
00386     case ')':
00387         code = TKN_RPAREN;
00388         break;
00389 
00390     case ';':
00391         code = TKN_SEMI;
00392         break;
00393 
00394     case '.':
00395         switch (peekStream()) {
00396         case '.':
00397             ignoreStream();
00398             code = TKN_DDOT;
00399             break;
00400 
00401         default:
00402             code = TKN_DOT;
00403         }
00404         break;
00405 
00406     case ':':
00407         switch (peekStream()) {
00408         case '=':
00409             ignoreStream();
00410             code = TKN_ASSIGN;
00411             break;
00412 
00413         default:
00414             code = TKN_COLON;
00415         }
00416         break;
00417 
00418     case ',':
00419         code = TKN_COMMA;
00420         break;
00421 
00422     case '=':
00423         switch (peekStream()) {
00424         default:
00425             code = TKN_EQUAL;
00426             break;
00427 
00428         case '>':
00429             ignoreStream();
00430             code = TKN_RDARROW;
00431             break;
00432         }
00433         break;
00434 
00435     case '<':
00436         switch (peekStream()) {
00437         default:
00438             code = TKN_LESS;
00439             break;
00440 
00441         case '=':
00442             ignoreStream();
00443             code = TKN_LEQ;
00444             break;
00445 
00446         case '>':
00447             ignoreStream();
00448             code = TKN_DIAMOND;
00449         }
00450         break;
00451 
00452     case '>':
00453         switch (peekStream()) {
00454         default:
00455             code = TKN_GREAT;
00456             break;
00457 
00458         case '=':
00459             ignoreStream();
00460             code = TKN_GEQ;
00461             break;
00462         }
00463         break;
00464 
00465     case '+':
00466         code = TKN_PLUS;
00467         break;
00468 
00469     case '-':
00470         code = TKN_MINUS;
00471         break;
00472 
00473     case '*':
00474         switch (peekStream()) {
00475         case '*':
00476             ignoreStream();
00477             code = TKN_POW;
00478             break;
00479 
00480         default:
00481             code = TKN_STAR;
00482         }
00483         break;
00484 
00485     case '/':
00486         switch (peekStream()) {
00487         case '=':
00488             ignoreStream();
00489             code = TKN_NEQUAL;
00490             break;
00491 
00492         default:
00493             code = TKN_FSLASH;
00494         }
00495         break;
00496 
00497     case '&':
00498         code = TKN_AMPER;
00499         break;
00500 
00501     case '@':
00502         code = TKN_AT;
00503         break;
00504 
00505     case '|':
00506         code = TKN_BAR;
00507         break;
00508     }
00509 
00510     if (code == UNUSED_ID) {
00511         ungetStream();
00512         return false;
00513     }
00514 
00515     emitToken(code, loc);
00516     return true;
00517 }
00518 
00519 bool Lexer::scanEscape()
00520 {
00521     Location loc = currentLocation();
00522     unsigned c;
00523 
00524     switch (c = readStream()) {
00525     case '\\': break;
00526     case '"' : break;
00527     case '\'': break;
00528     case 't' : break;
00529     case 'n' : break;
00530     case 'r' : break;
00531     case 'b' : break;
00532 
00533     case 0:
00534         // Premature end of stream.  We let this condition be picked up by the
00535         // caller.
00536         ungetStream();
00537         return false;
00538 
00539     default:
00540         // Illegal escape sequence.
00541         report(loc, diag::ILLEGAL_ESCAPE) << (char)c;
00542         return false;
00543     }
00544     return true;
00545 }
00546 
00547 bool Lexer::scanCharacter()
00548 {
00549     TextIterator start = currentIter;
00550     Location loc = currentLocation();
00551     unsigned c;
00552 
00553     if (peekStream() == '\'') {
00554         ignoreStream();
00555         c = readStream();
00556 
00557         if (c == '\'') {
00558             // Empty enumeration literal.  This is not valid.  Consume and
00559             // report.
00560             report(loc, diag::EMPTY_CHARACTER_LITERAL);
00561             emitCharacterToken(start, currentIter);
00562             return true;
00563         }
00564 
00565         if (peekStream() != '\'') {
00566             // If the character is not terminated, this must be an attribute
00567             // selector.  Unget the current character and return a quote token.
00568             ungetStream();
00569             emitToken(TKN_QUOTE, loc);
00570             return true;
00571         }
00572 
00573         // Special case for the character representing a left paren.  We need to
00574         // deal with the special case of a qualified expression containing a
00575         // character.  Take "Character'('x')" or "String'('x', 'y')" for
00576         // example.  We handle this oddball case by checking if another quote is
00577         // two characters away.
00578         if (c == '(') {
00579             TextIterator cursor = currentIter;
00580             if (*++cursor && *++cursor == '\'') {
00581                 ungetStream();
00582                 emitToken(TKN_QUOTE, loc);
00583                 return true;
00584             }
00585         }
00586 
00587         // Otherwise we have a character literal.
00588         //
00589         // FIXME: Ensure the character belongs to the standard character set.
00590         ignoreStream();
00591         emitCharacterToken(start, currentIter);
00592         return true;
00593     }
00594     return false;
00595 }
00596 
00597 bool Lexer::scanString()
00598 {
00599     TextIterator start = currentIter;
00600     Location loc = currentLocation();
00601     unsigned c;
00602 
00603     if (peekStream() == '"') {
00604         ignoreStream();
00605 
00606         for (;;) {
00607             switch (c = readStream()) {
00608             case '\\':
00609                 // Note that if scanning of the escape fails, we simply do not
00610                 // accumulate the offending sequence and continue scanning.
00611                 scanEscape();
00612                 break;
00613 
00614             case 0:
00615                 // Premature end of stream.  Form the string literal from all
00616                 // tokens accumulated thus far.
00617                 report(loc, diag::UNTERMINATED_STRING);
00618                 emitStringToken(start, currentIter);
00619                 return true;
00620 
00621             case '\n':
00622                 // Embedded newline.
00623                 report(loc, diag::NEWLINE_IN_STRING_LIT);
00624                 emitStringToken(start, currentIter);
00625                 return true;
00626 
00627             case '"':
00628                 // End of string literal.
00629                 emitStringToken(start, currentIter);
00630                 return true;
00631             }
00632         }
00633     }
00634     return false;
00635 }
00636 
00637 bool Lexer::scanNumeric()
00638 {
00639     Location loc = currentLocation();
00640     TextIterator start = currentIter;
00641     unsigned c = peekStream();
00642 
00643     if (isDecimalDigit(c)) {
00644         ignoreStream();
00645 
00646         // Decimal literals cannot have a leading zero (except for the zero
00647         // literal, of course).  When we spot such a malformed integer, emit a
00648         // diagnostic and drop the leading zeros.
00649         if (c == '0' && isDecimalDigit(peekStream())) {
00650             report(loc, diag::LEADING_ZERO_IN_INTEGER_LIT);
00651 
00652             while (peekStream() == '0') ignoreStream();
00653 
00654             // Check if we have a string of zeros.  Simply return the zero token
00655             // in such a case.  Otherwise, continue scanning normally.
00656             if (!isDecimalDigit(peekStream())) {
00657                 TextIterator end = start;
00658                 emitIntegerToken(start, ++end);
00659                 return true;
00660             }
00661             else c = readStream();
00662         }
00663 
00664         for (;;) {
00665             c = readStream();
00666 
00667             if (isDecimalDigit(c) || c == '_')
00668                 continue;
00669             else {
00670                 ungetStream();
00671                 break;
00672             }
00673         }
00674         emitIntegerToken(start, currentIter);
00675         return true;
00676     }
00677     return false;
00678 }
00679 
00680 void Lexer::beginExcursion()
00681 {
00682     positionStack.push_back(index);
00683 }
00684 
00685 void Lexer::endExcursion()
00686 {
00687     index = positionStack.back();
00688     positionStack.pop_back();
00689 }
00690 
00691 void Lexer::forgetExcursion()
00692 {
00693     unsigned saved_index = positionStack.back();
00694     positionStack.pop_back();
00695 
00696     if (positionStack.empty()) {
00697         assert(saved_index == 0 && "index/position mismatch!");
00698         ((void)saved_index);
00699         tokens.clear();
00700     }
00701 }
00702 
00703 void Lexer::peek(Token &tkn, unsigned n)
00704 {
00705     unsigned numTokens = tokens.size();
00706 
00707     if (index + n < numTokens) {
00708         tkn = tokens[index + n];
00709         return;
00710     }
00711 
00712     unsigned tokensNeeded = index + n - numTokens;
00713     targetToken = &tkn;
00714     for (unsigned i = 0; i <= tokensNeeded; ++i) {
00715         scanToken();
00716         if (targetToken->getCode() != TKN_EOT)
00717             tokens.push_back(*targetToken);
00718     }
00719 }
00720 
00721 void Lexer::scan(Token &tkn)
00722 {
00723     unsigned numTokens = tokens.size();
00724 
00725     // Check if we have a cached token to return.
00726     if (index < numTokens) {
00727         tkn = tokens[index++];
00728         return;
00729     }
00730 
00731     // Clear the token buffer if it is not empty and we are not in an excursion.
00732     if (numTokens && positionStack.empty()) {
00733         tokens.clear();
00734         index = 0;
00735     }
00736 
00737     targetToken = &tkn;
00738 
00739     scanToken();
00740 
00741     // Save the token if we are in an excursion and it is not EOT.
00742     if (!positionStack.empty() && targetToken->getCode() != TKN_EOT) {
00743         index++;
00744         tokens.push_back(*targetToken);
00745     }
00746 }
00747 
00748 void Lexer::scanToken()
00749 {
00750     for (;;) {
00751         eatWhitespace();
00752         while (eatComment()) eatWhitespace();
00753 
00754         if (peekStream() == 0 || scanningAborted) {
00755             emitToken(TKN_EOT, Location());
00756             return;
00757         }
00758 
00759         if (scanWord())      return;
00760         if (scanGlyph())     return;
00761         if (scanString())    return;
00762         if (scanNumeric())   return;
00763         if (scanCharacter()) return;
00764 
00765         // For invalid character data emit a diagnostic and abort the scan.
00766         //
00767         // FIXME: There should be an isSourceChar function to check if the
00768         // character belongs to the source character set.  Scanning could just
00769         // skip legal characters.  Characters which do not fall into the
00770         // expected character set should likely have their hex value printed.
00771         report(diag::INVALID_CHARACTER) << static_cast<char>(peekStream());
00772         ignoreStream();
00773         abortScanning();
00774     }
00775 }
00776