diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Lex/Lexer.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Lex/Lexer.cpp | 470 |
1 files changed, 318 insertions, 152 deletions
diff --git a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp index 9958287ba474..c071455da662 100644 --- a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp +++ b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp @@ -29,6 +29,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Lex/CodeCompletionHandler.h" #include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/LiteralSupport.h" #include "clang/Lex/Preprocessor.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" @@ -93,6 +94,10 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr, // Start of the file is a start of line. IsAtStartOfLine = true; + IsAtPhysicalStartOfLine = true; + + HasLeadingSpace = false; + HasLeadingEmptyMacro = false; // We are not after parsing a #. ParsingPreprocessorDirective = false; @@ -430,7 +435,8 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc, /// \returns true if there was a failure, false on success. bool Lexer::getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, - const LangOptions &LangOpts) { + const LangOptions &LangOpts, + bool IgnoreWhiteSpace) { // TODO: this could be special cased for common tokens like identifiers, ')', // etc to make this faster, if it mattered. Just look at StrData[0] to handle // all obviously single-char tokens. This could use @@ -448,7 +454,7 @@ bool Lexer::getRawToken(SourceLocation Loc, Token &Result, const char *StrData = Buffer.data()+LocInfo.second; - if (isWhitespace(StrData[0])) + if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) return true; // Create a lexer starting at the beginning of this token. @@ -798,14 +804,10 @@ bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, SourceLocation *MacroBegin) { assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); - std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc); - // FIXME: If the token comes from the macro token paste operator ('##') - // this function will always return false; - if (infoLoc.second > 0) - return false; // Does not point at the start of token. + SourceLocation expansionLoc; + if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) + return false; - SourceLocation expansionLoc = - SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart(); if (expansionLoc.isFileID()) { // No other macro expansions, this is the first. if (MacroBegin) @@ -829,16 +831,11 @@ bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, if (tokLen == 0) return false; - FileID FID = SM.getFileID(loc); - SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1); - if (SM.isInFileID(afterLoc, FID)) - return false; // Still in the same FileID, does not point to the last token. - - // FIXME: If the token comes from the macro token paste operator ('##') - // or the stringify operator ('#') this function will always return false; + SourceLocation afterLoc = loc.getLocWithOffset(tokLen); + SourceLocation expansionLoc; + if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) + return false; - SourceLocation expansionLoc = - SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd(); if (expansionLoc.isFileID()) { // No other macro expansions. if (MacroEnd) @@ -916,25 +913,25 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, return makeRangeFromFileLocs(Range, SM, LangOpts); } - FileID FID; - unsigned BeginOffs; - llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); - if (FID.isInvalid()) + bool Invalid = false; + const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), + &Invalid); + if (Invalid) return CharSourceRange(); - unsigned EndOffs; - if (!SM.isInFileID(End, FID, &EndOffs) || - BeginOffs > EndOffs) - return CharSourceRange(); + if (BeginEntry.getExpansion().isMacroArgExpansion()) { + const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), + &Invalid); + if (Invalid) + return CharSourceRange(); - const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); - const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); - if (Expansion.isMacroArgExpansion() && - Expansion.getSpellingLoc().isFileID()) { - SourceLocation SpellLoc = Expansion.getSpellingLoc(); - Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs)); - Range.setEnd(SpellLoc.getLocWithOffset(EndOffs)); - return makeRangeFromFileLocs(Range, SM, LangOpts); + if (EndEntry.getExpansion().isMacroArgExpansion() && + BeginEntry.getExpansion().getExpansionLocStart() == + EndEntry.getExpansion().getExpansionLocStart()) { + Range.setBegin(SM.getImmediateSpellingLoc(Begin)); + Range.setEnd(SM.getImmediateSpellingLoc(End)); + return makeFileCharRange(Range, SM, LangOpts); + } } return CharSourceRange(); @@ -1369,26 +1366,42 @@ void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) { BufferPtr += Bytes; if (BufferPtr > BufferEnd) BufferPtr = BufferEnd; + // FIXME: What exactly does the StartOfLine bit mean? There are two + // possible meanings for the "start" of the line: the first token on the + // unexpanded line, or the first token on the expanded line. IsAtStartOfLine = StartOfLine; + IsAtPhysicalStartOfLine = StartOfLine; } static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { - if (LangOpts.CPlusPlus11 || LangOpts.C11) - return isCharInSet(C, C11AllowedIDChars); - else if (LangOpts.CPlusPlus) - return isCharInSet(C, CXX03AllowedIDChars); - else - return isCharInSet(C, C99AllowedIDChars); + if (LangOpts.CPlusPlus11 || LangOpts.C11) { + static const llvm::sys::UnicodeCharSet C11AllowedIDChars( + C11AllowedIDCharRanges); + return C11AllowedIDChars.contains(C); + } else if (LangOpts.CPlusPlus) { + static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( + CXX03AllowedIDCharRanges); + return CXX03AllowedIDChars.contains(C); + } else { + static const llvm::sys::UnicodeCharSet C99AllowedIDChars( + C99AllowedIDCharRanges); + return C99AllowedIDChars.contains(C); + } } static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { assert(isAllowedIDChar(C, LangOpts)); - if (LangOpts.CPlusPlus11 || LangOpts.C11) - return !isCharInSet(C, C11DisallowedInitialIDChars); - else if (LangOpts.CPlusPlus) + if (LangOpts.CPlusPlus11 || LangOpts.C11) { + static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( + C11DisallowedInitialIDCharRanges); + return !C11DisallowedInitialIDChars.contains(C); + } else if (LangOpts.CPlusPlus) { return true; - else - return !isCharInSet(C, C99DisallowedInitialIDChars); + } else { + static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( + C99DisallowedInitialIDCharRanges); + return !C99DisallowedInitialIDChars.contains(C); + } } static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, @@ -1407,11 +1420,15 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CannotStartIdentifier }; - if (!isCharInSet(C, C99AllowedIDChars)) { + static const llvm::sys::UnicodeCharSet C99AllowedIDChars( + C99AllowedIDCharRanges); + static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( + C99DisallowedInitialIDCharRanges); + if (!C99AllowedIDChars.contains(C)) { Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) << Range << CannotAppearInIdentifier; - } else if (IsFirst && isCharInSet(C, C99DisallowedInitialIDChars)) { + } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) << Range << CannotStartIdentifier; @@ -1421,14 +1438,16 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, // Check C++98 compatibility. if (Diags.getDiagnosticLevel(diag::warn_cxx98_compat_unicode_id, Range.getBegin()) > DiagnosticsEngine::Ignored) { - if (!isCharInSet(C, CXX03AllowedIDChars)) { + static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( + CXX03AllowedIDCharRanges); + if (!CXX03AllowedIDChars.contains(C)) { Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) << Range; } } } -void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { +bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] unsigned Size; unsigned char C = *CurPtr++; @@ -1452,7 +1471,7 @@ FinishIdentifier: // If we are in raw mode, return this identifier raw. There is no need to // look up identifier information or attempt to macro expand it. if (LexingRawMode) - return; + return true; // Fill in Result.IdentifierInfo and update the token kind, // looking up the identifier in the identifier table. @@ -1461,9 +1480,9 @@ FinishIdentifier: // Finally, now that we know we have an identifier, pass this off to the // preprocessor, which may macro expand it or something. if (II->isHandleIdentifierCase()) - PP->HandleIdentifier(Result); + return PP->HandleIdentifier(Result); - return; + return true; } // Otherwise, $,\,? in identifier found. Enter slower path. @@ -1553,7 +1572,7 @@ bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { /// LexNumericConstant - Lex the remainder of a integer or floating point /// constant. From[-1] is the first character lexed. Return the end of the /// constant. -void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { +bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { unsigned Size; char C = getCharAndSize(CurPtr, Size); char PrevCh = 0; @@ -1587,15 +1606,29 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); } + // If we have a digit separator, continue. + if (C == '\'' && getLangOpts().CPlusPlus1y) { + unsigned NextSize; + char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); + if (isIdentifierBody(Next)) { + if (!isLexingRawMode()) + Diag(CurPtr, diag::warn_cxx11_compat_digit_separator); + CurPtr = ConsumeChar(CurPtr, Size, Result); + return LexNumericConstant(Result, CurPtr); + } + } + // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::numeric_constant); Result.setLiteralData(TokStart); + return true; } /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes /// in C++11, or warn on a ud-suffix in C++98. -const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) { +const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, + bool IsStringLiteral) { assert(getLangOpts().CPlusPlus); // Maximally munch an identifier. FIXME: UCNs. @@ -1615,9 +1648,41 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) { // that does not start with an underscore is ill-formed. As a conforming // extension, we treat all such suffixes as if they had whitespace before // them. - if (C != '_') { + bool IsUDSuffix = false; + if (C == '_') + IsUDSuffix = true; + else if (IsStringLiteral && getLangOpts().CPlusPlus1y) { + // In C++1y, we need to look ahead a few characters to see if this is a + // valid suffix for a string literal or a numeric literal (this could be + // the 'operator""if' defining a numeric literal operator). + const unsigned MaxStandardSuffixLength = 3; + char Buffer[MaxStandardSuffixLength] = { C }; + unsigned Consumed = Size; + unsigned Chars = 1; + while (true) { + unsigned NextSize; + char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, + getLangOpts()); + if (!isIdentifierBody(Next)) { + // End of suffix. Check whether this is on the whitelist. + IsUDSuffix = (Chars == 1 && Buffer[0] == 's') || + NumericLiteralParser::isValidUDSuffix( + getLangOpts(), StringRef(Buffer, Chars)); + break; + } + + if (Chars == MaxStandardSuffixLength) + // Too long: can't be a standard suffix. + break; + + Buffer[Chars++] = Next; + Consumed += NextSize; + } + } + + if (!IsUDSuffix) { if (!isLexingRawMode()) - Diag(CurPtr, getLangOpts().MicrosoftMode ? + Diag(CurPtr, getLangOpts().MicrosoftMode ? diag::ext_ms_reserved_user_defined_literal : diag::ext_reserved_user_defined_literal) << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); @@ -1635,7 +1700,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) { /// LexStringLiteral - Lex the remainder of a string literal, after having lexed /// either " or L" or u8" or u" or U". -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, +bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, tok::TokenKind Kind) { const char *NulCharacter = 0; // Does this string contain the \0 character? @@ -1659,14 +1724,15 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferPtr, diag::ext_unterminated_string); FormTokenWithChars(Result, CurPtr-1, tok::unknown); - return; + return true; } if (C == 0) { if (isCodeCompletionPoint(CurPtr-1)) { PP->CodeCompleteNaturalLanguage(); FormTokenWithChars(Result, CurPtr-1, tok::unknown); - return cutOffLexing(); + cutOffLexing(); + return true; } NulCharacter = CurPtr-1; @@ -1676,7 +1742,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, // If we are in C++11, lex the optional ud-suffix. if (getLangOpts().CPlusPlus) - CurPtr = LexUDSuffix(Result, CurPtr); + CurPtr = LexUDSuffix(Result, CurPtr, true); // If a nul character existed in the string, warn about it. if (NulCharacter && !isLexingRawMode()) @@ -1686,11 +1752,12 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, Kind); Result.setLiteralData(TokStart); + return true; } /// LexRawStringLiteral - Lex the remainder of a raw string literal, after /// having lexed R", LR", u8R", uR", or UR". -void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, +bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, tok::TokenKind Kind) { // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: // Between the initial and final double quote characters of the raw string, @@ -1732,7 +1799,7 @@ void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, } FormTokenWithChars(Result, CurPtr, tok::unknown); - return; + return true; } // Save prefix and move CurPtr past it @@ -1753,23 +1820,24 @@ void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, Diag(BufferPtr, diag::err_unterminated_raw_string) << StringRef(Prefix, PrefixLen); FormTokenWithChars(Result, CurPtr-1, tok::unknown); - return; + return true; } } // If we are in C++11, lex the optional ud-suffix. if (getLangOpts().CPlusPlus) - CurPtr = LexUDSuffix(Result, CurPtr); + CurPtr = LexUDSuffix(Result, CurPtr, true); // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, Kind); Result.setLiteralData(TokStart); + return true; } /// LexAngledStringLiteral - Lex the remainder of an angled string literal, /// after having lexed the '<' character. This is used for #include filenames. -void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { +bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { const char *NulCharacter = 0; // Does this string contain the \0 character? const char *AfterLessPos = CurPtr; char C = getAndAdvanceChar(CurPtr, Result); @@ -1784,7 +1852,7 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { // If the filename is unterminated, then it must just be a lone < // character. Return this as such. FormTokenWithChars(Result, AfterLessPos, tok::less); - return; + return true; } else if (C == 0) { NulCharacter = CurPtr-1; } @@ -1799,12 +1867,13 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); Result.setLiteralData(TokStart); + return true; } /// LexCharConstant - Lex the remainder of a character constant, after having /// lexed either ' or L' or u' or U'. -void Lexer::LexCharConstant(Token &Result, const char *CurPtr, +bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, tok::TokenKind Kind) { const char *NulCharacter = 0; // Does this character contain the \0 character? @@ -1819,7 +1888,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr, if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferPtr, diag::ext_empty_character); FormTokenWithChars(Result, CurPtr, tok::unknown); - return; + return true; } while (C != '\'') { @@ -1832,14 +1901,15 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr, if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferPtr, diag::ext_unterminated_char); FormTokenWithChars(Result, CurPtr-1, tok::unknown); - return; + return true; } if (C == 0) { if (isCodeCompletionPoint(CurPtr-1)) { PP->CodeCompleteNaturalLanguage(); FormTokenWithChars(Result, CurPtr-1, tok::unknown); - return cutOffLexing(); + cutOffLexing(); + return true; } NulCharacter = CurPtr-1; @@ -1849,7 +1919,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr, // If we are in C++11, lex the optional ud-suffix. if (getLangOpts().CPlusPlus) - CurPtr = LexUDSuffix(Result, CurPtr); + CurPtr = LexUDSuffix(Result, CurPtr, false); // If a nul character existed in the character, warn about it. if (NulCharacter && !isLexingRawMode()) @@ -1859,6 +1929,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr, const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, Kind); Result.setLiteralData(TokStart); + return true; } /// SkipWhitespace - Efficiently skip over a series of whitespace characters. @@ -1866,11 +1937,14 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr, /// /// This method forms a token and returns true if KeepWhitespaceMode is enabled. /// -bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { +bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, + bool &TokAtPhysicalStartOfLine) { // Whitespace - Skip it, then return the token after the whitespace. bool SawNewline = isVerticalWhitespace(CurPtr[-1]); - unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. + unsigned char Char = *CurPtr; + + // Skip consecutive spaces efficiently. while (1) { // Skip horizontal whitespace very aggressively. while (isHorizontalWhitespace(Char)) @@ -1886,7 +1960,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { return false; } - // ok, but handle newline. + // OK, but handle newline. SawNewline = true; Char = *++CurPtr; } @@ -1894,8 +1968,10 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { // If the client wants us to return whitespace, return it now. if (isKeepWhitespaceMode()) { FormTokenWithChars(Result, CurPtr, tok::unknown); - if (SawNewline) + if (SawNewline) { IsAtStartOfLine = true; + IsAtPhysicalStartOfLine = true; + } // FIXME: The next token will not have LeadingSpace set. return true; } @@ -1905,8 +1981,10 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); - if (SawNewline) + if (SawNewline) { Result.setFlag(Token::StartOfLine); + TokAtPhysicalStartOfLine = true; + } BufferPtr = CurPtr; return false; @@ -1918,7 +1996,8 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { /// /// If we're in KeepCommentMode or any CommentHandler has inserted /// some tokens, this will store the first token and return true. -bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) { +bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, + bool &TokAtPhysicalStartOfLine) { // If Line comments aren't explicitly enabled for this language, emit an // extension warning. if (!LangOpts.LineComment && !isLexingRawMode()) { @@ -2037,6 +2116,7 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) { // The next returned token is at the start of the line. Result.setFlag(Token::StartOfLine); + TokAtPhysicalStartOfLine = true; // No leading whitespace seen so far. Result.clearFlag(Token::LeadingSpace); BufferPtr = CurPtr; @@ -2147,7 +2227,8 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, /// /// If we're in KeepCommentMode or any CommentHandler has inserted /// some tokens, this will store the first token and return true. -bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { +bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, + bool &TokAtPhysicalStartOfLine) { // Scan one character past where we should, looking for a '/' character. Once // we find it, check to see if it was preceded by a *. This common // optimization helps people who like to put a lot of * characters in their @@ -2202,7 +2283,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { // Adjust the pointer to point directly after the first slash. It's // not necessary to set C here, it will be overwritten at the end of // the outer loop. - CurPtr += llvm::CountTrailingZeros_32(cmp) + 1; + CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; goto FoundSlash; } CurPtr += 16; @@ -2298,7 +2379,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { // efficiently now. This is safe even in KeepWhitespaceMode because we would // have already returned above with the comment as a token. if (isHorizontalWhitespace(*CurPtr)) { - SkipWhitespace(Result, CurPtr+1); + SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); return false; } @@ -2404,10 +2485,28 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue // a pedwarn. - if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) - Diag(BufferEnd, LangOpts.CPlusPlus11 ? // C++11 [lex.phases] 2.2 p2 - diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof) - << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); + if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { + DiagnosticsEngine &Diags = PP->getDiagnostics(); + SourceLocation EndLoc = getSourceLocation(BufferEnd); + unsigned DiagID; + + if (LangOpts.CPlusPlus11) { + // C++11 [lex.phases] 2.2 p2 + // Prefer the C++98 pedantic compatibility warning over the generic, + // non-extension, user-requested "missing newline at EOF" warning. + if (Diags.getDiagnosticLevel(diag::warn_cxx98_compat_no_newline_eof, + EndLoc) != DiagnosticsEngine::Ignored) { + DiagID = diag::warn_cxx98_compat_no_newline_eof; + } else { + DiagID = diag::warn_no_newline_eof; + } + } else { + DiagID = diag::ext_no_newline_eof; + } + + Diag(BufferEnd, DiagID) + << FixItHint::CreateInsertion(EndLoc, "\n"); + } BufferPtr = CurPtr; @@ -2430,14 +2529,19 @@ unsigned Lexer::isNextPPTokenLParen() { // Save state that can be changed while lexing so that we can restore it. const char *TmpBufferPtr = BufferPtr; bool inPPDirectiveMode = ParsingPreprocessorDirective; + bool atStartOfLine = IsAtStartOfLine; + bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; + bool leadingSpace = HasLeadingSpace; Token Tok; - Tok.startToken(); - LexTokenInternal(Tok); + Lex(Tok); // Restore state that may have changed. BufferPtr = TmpBufferPtr; ParsingPreprocessorDirective = inPPDirectiveMode; + HasLeadingSpace = leadingSpace; + IsAtStartOfLine = atStartOfLine; + IsAtPhysicalStartOfLine = atPhysicalStartOfLine; // Restore the lexer back to non-skipping mode. LexingRawMode = false; @@ -2626,6 +2730,10 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, StartPtr = CurPtr; } + // Don't apply C family restrictions to UCNs in assembly mode + if (LangOpts.AsmPreprocessor) + return CodePoint; + // C99 6.4.3p2: A universal character name shall not specify a character whose // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or // 0060 (`), nor one in the range D800 through DFFF inclusive.) @@ -2670,19 +2778,22 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, return CodePoint; } -void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { +bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, + const char *CurPtr) { + static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( + UnicodeWhitespaceCharRanges); if (!isLexingRawMode() && !PP->isPreprocessedOutput() && - isCharInSet(C, UnicodeWhitespaceChars)) { + UnicodeWhitespaceChars.contains(C)) { Diag(BufferPtr, diag::ext_unicode_whitespace) << makeCharRange(*this, BufferPtr, CurPtr); Result.setFlag(Token::LeadingSpace); - if (SkipWhitespace(Result, CurPtr)) - return; // KeepWhitespaceMode - - return LexTokenInternal(Result); + return true; } + return false; +} +bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { if (!isLexingRawMode() && !ParsingPreprocessorDirective && !PP->isPreprocessedOutput()) { @@ -2711,22 +2822,59 @@ void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); BufferPtr = CurPtr; - return LexTokenInternal(Result); + return false; } // Otherwise, we have an explicit UCN or a character that's unlikely to show // up by accident. MIOpt.ReadToken(); FormTokenWithChars(Result, CurPtr, tok::unknown); + return true; } +void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { + IsAtStartOfLine = Result.isAtStartOfLine(); + HasLeadingSpace = Result.hasLeadingSpace(); + HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); + // Note that this doesn't affect IsAtPhysicalStartOfLine. +} + +bool Lexer::Lex(Token &Result) { + // Start a new token. + Result.startToken(); + + // Set up misc whitespace flags for LexTokenInternal. + if (IsAtStartOfLine) { + Result.setFlag(Token::StartOfLine); + IsAtStartOfLine = false; + } + + if (HasLeadingSpace) { + Result.setFlag(Token::LeadingSpace); + HasLeadingSpace = false; + } + + if (HasLeadingEmptyMacro) { + Result.setFlag(Token::LeadingEmptyMacro); + HasLeadingEmptyMacro = false; + } + + bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; + IsAtPhysicalStartOfLine = false; + bool isRawLex = isLexingRawMode(); + (void) isRawLex; + bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); + // (After the LexTokenInternal call, the lexer might be destroyed.) + assert((returnedToken || !isRawLex) && "Raw lex must succeed"); + return returnedToken; +} /// LexTokenInternal - This implements a simple C family lexer. It is an /// extremely performance critical piece of code. This assumes that the buffer /// has a null character at the end of the file. This returns a preprocessing /// token, not a normal token, as such, it is an internal interface. It assumes /// that the Flags of result have been cleared before calling this. -void Lexer::LexTokenInternal(Token &Result) { +bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { LexNextToken: // New token, can't need cleaning yet. Result.clearFlag(Token::NeedsCleaning); @@ -2747,7 +2895,7 @@ LexNextToken: if (isKeepWhitespaceMode()) { FormTokenWithChars(Result, CurPtr, tok::unknown); // FIXME: The next token will not have LeadingSpace set. - return; + return true; } BufferPtr = CurPtr; @@ -2763,43 +2911,32 @@ LexNextToken: switch (Char) { case 0: // Null. // Found end of file? - if (CurPtr-1 == BufferEnd) { - // Read the PP instance variable into an automatic variable, because - // LexEndOfFile will often delete 'this'. - Preprocessor *PPCache = PP; - if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. - return; // Got a token to return. - assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); - return PPCache->Lex(Result); - } + if (CurPtr-1 == BufferEnd) + return LexEndOfFile(Result, CurPtr-1); // Check if we are performing code completion. if (isCodeCompletionPoint(CurPtr-1)) { // Return the code-completion token. Result.startToken(); FormTokenWithChars(Result, CurPtr, tok::code_completion); - return; + return true; } if (!isLexingRawMode()) Diag(CurPtr-1, diag::null_in_file); Result.setFlag(Token::LeadingSpace); - if (SkipWhitespace(Result, CurPtr)) - return; // KeepWhitespaceMode + if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) + return true; // KeepWhitespaceMode - goto LexNextToken; // GCC isn't tail call eliminating. + // We know the lexer hasn't changed, so just try again with this lexer. + // (We manually eliminate the tail call to avoid recursion.) + goto LexNextToken; case 26: // DOS & CP/M EOF: "^Z". // If we're in Microsoft extensions mode, treat this as end of file. - if (LangOpts.MicrosoftExt) { - // Read the PP instance variable into an automatic variable, because - // LexEndOfFile will often delete 'this'. - Preprocessor *PPCache = PP; - if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. - return; // Got a token to return. - assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); - return PPCache->Lex(Result); - } + if (LangOpts.MicrosoftExt) + return LexEndOfFile(Result, CurPtr-1); + // If Microsoft extensions are disabled, this is just random garbage. Kind = tok::unknown; break; @@ -2818,6 +2955,7 @@ LexNextToken: // Since we consumed a newline, we are back at the start of a line. IsAtStartOfLine = true; + IsAtPhysicalStartOfLine = true; Kind = tok::eod; break; @@ -2826,17 +2964,20 @@ LexNextToken: // No leading whitespace seen so far. Result.clearFlag(Token::LeadingSpace); - if (SkipWhitespace(Result, CurPtr)) - return; // KeepWhitespaceMode - goto LexNextToken; // GCC isn't tail call eliminating. + if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) + return true; // KeepWhitespaceMode + + // We only saw whitespace, so just try again with this lexer. + // (We manually eliminate the tail call to avoid recursion.) + goto LexNextToken; case ' ': case '\t': case '\f': case '\v': SkipHorizontalWhitespace: Result.setFlag(Token::LeadingSpace); - if (SkipWhitespace(Result, CurPtr)) - return; // KeepWhitespaceMode + if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) + return true; // KeepWhitespaceMode SkipIgnoredUnits: CurPtr = BufferPtr; @@ -2844,18 +2985,21 @@ LexNextToken: // If the next token is obviously a // or /* */ comment, skip it efficiently // too (without going through the big switch stmt). if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && - LangOpts.LineComment && !LangOpts.TraditionalCPP) { - if (SkipLineComment(Result, CurPtr+2)) - return; // There is a token to return. + LangOpts.LineComment && + (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { + if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) + return true; // There is a token to return. goto SkipIgnoredUnits; } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { - if (SkipBlockComment(Result, CurPtr+2)) - return; // There is a token to return. + if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) + return true; // There is a token to return. goto SkipIgnoredUnits; } else if (isHorizontalWhitespace(*CurPtr)) { goto SkipHorizontalWhitespace; } - goto LexNextToken; // GCC isn't tail call eliminating. + // We only saw whitespace, so just try again with this lexer. + // (We manually eliminate the tail call to avoid recursion.) + goto LexNextToken; // C99 6.4.4.1: Integer Constants. // C99 6.4.4.2: Floating Constants. @@ -3141,14 +3285,16 @@ LexNextToken: // "foo". Check to see if the character after the second slash is a '*'. // If so, we will lex that as a "/" instead of the start of a comment. // However, we never do this if we are just preprocessing. - bool TreatAsComment = LangOpts.LineComment && !LangOpts.TraditionalCPP; + bool TreatAsComment = LangOpts.LineComment && + (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); if (!TreatAsComment) if (!(PP && PP->isPreprocessedOutput())) TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; if (TreatAsComment) { - if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) - return; // There is a token to return. + if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), + TokAtPhysicalStartOfLine)) + return true; // There is a token to return. // It is common for the tokens immediately after a // comment to be // whitespace (indentation for the next line). Instead of going through @@ -3158,9 +3304,13 @@ LexNextToken: } if (Char == '*') { // /**/ comment. - if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) - return; // There is a token to return. - goto LexNextToken; // GCC isn't tail call eliminating. + if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), + TokAtPhysicalStartOfLine)) + return true; // There is a token to return. + + // We only saw whitespace, so just try again with this lexer. + // (We manually eliminate the tail call to avoid recursion.) + goto LexNextToken; } if (Char == '=') { @@ -3195,7 +3345,7 @@ LexNextToken: // it's actually the start of a preprocessing directive. Callback to // the preprocessor to handle it. // FIXME: -fpreprocessed mode?? - if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) + if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) goto HandleDirective; Kind = tok::hash; @@ -3361,7 +3511,7 @@ LexNextToken: // it's actually the start of a preprocessing directive. Callback to // the preprocessor to handle it. // FIXME: -fpreprocessed mode?? - if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) + if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) goto HandleDirective; Kind = tok::hash; @@ -3378,8 +3528,18 @@ LexNextToken: // UCNs (C99 6.4.3, C++11 [lex.charset]p2) case '\\': - if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) + if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { + if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { + if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) + return true; // KeepWhitespaceMode + + // We only saw whitespace, so just try again with this lexer. + // (We manually eliminate the tail call to avoid recursion.) + goto LexNextToken; + } + return LexUnicode(Result, CodePoint, CurPtr); + } Kind = tok::unknown; break; @@ -3400,8 +3560,17 @@ LexNextToken: (const UTF8 *)BufferEnd, &CodePoint, strictConversion); - if (Status == conversionOK) + if (Status == conversionOK) { + if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { + if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) + return true; // KeepWhitespaceMode + + // We only saw whitespace, so just try again with this lexer. + // (We manually eliminate the tail call to avoid recursion.) + goto LexNextToken; + } return LexUnicode(Result, CodePoint, CurPtr); + } if (isLexingRawMode() || ParsingPreprocessorDirective || PP->isPreprocessedOutput()) { @@ -3416,6 +3585,9 @@ LexNextToken: Diag(CurPtr, diag::err_invalid_utf8); BufferPtr = CurPtr+1; + // We're pretending the character didn't exist, so just try again with + // this lexer. + // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; } } @@ -3425,7 +3597,7 @@ LexNextToken: // Update the location of token as well as BufferPtr. FormTokenWithChars(Result, CurPtr, Kind); - return; + return true; HandleDirective: // We parsed a # character and it's the start of a preprocessing directive. @@ -3433,18 +3605,12 @@ HandleDirective: FormTokenWithChars(Result, CurPtr, tok::hash); PP->HandleDirective(Result); - // As an optimization, if the preprocessor didn't switch lexers, tail - // recurse. - if (PP->isCurrentLexer(this)) { - // Start a new token. If this is a #include or something, the PP may - // want us starting at the beginning of the line again. If so, set - // the StartOfLine flag and clear LeadingSpace. - if (IsAtStartOfLine) { - Result.setFlag(Token::StartOfLine); - Result.clearFlag(Token::LeadingSpace); - IsAtStartOfLine = false; - } - goto LexNextToken; // GCC isn't tail call eliminating. + if (PP->hadModuleLoaderFatalFailure()) { + // With a fatal failure in the module loader, we abort parsing. + assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); + return true; } - return PP->Lex(Result); + + // We parsed the directive; lex a token with the new state. + return false; } |