aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Lex/Lexer.cpp')
-rw-r--r-- contrib/llvm/tools/clang/lib/Lex/Lexer.cpp 470
1 file changed, 318 insertions, 152 deletions
diff --git a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
index 9958287ba474..c071455da662 100644
--- a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
+++ b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
@@ -29,6 +29,7 @@
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
@@ -93,6 +94,10 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
// Start of the file is a start of line.
IsAtStartOfLine = true;
+ IsAtPhysicalStartOfLine = true;
+
+ HasLeadingSpace = false;
+ HasLeadingEmptyMacro = false;
// We are not after parsing a #.
ParsingPreprocessorDirective = false;
@@ -430,7 +435,8 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
const SourceManager &SM,
- const LangOptions &LangOpts) {
+ const LangOptions &LangOpts,
+ bool IgnoreWhiteSpace) {
// TODO: this could be special cased for common tokens like identifiers, ')',
// etc to make this faster, if it mattered. Just look at StrData[0] to handle
// all obviously single-char tokens. This could use
@@ -448,7 +454,7 @@ bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
const char *StrData = Buffer.data()+LocInfo.second;
- if (isWhitespace(StrData[0]))
+ if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
return true;
// Create a lexer starting at the beginning of this token.
@@ -798,14 +804,10 @@ bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
SourceLocation *MacroBegin) {
assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
- std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc);
- // FIXME: If the token comes from the macro token paste operator ('##')
- // this function will always return false;
- if (infoLoc.second > 0)
- return false; // Does not point at the start of token.
+ SourceLocation expansionLoc;
+ if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
+ return false;
- SourceLocation expansionLoc =
- SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart();
if (expansionLoc.isFileID()) {
// No other macro expansions, this is the first.
if (MacroBegin)
@@ -829,16 +831,11 @@ bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
if (tokLen == 0)
return false;
- FileID FID = SM.getFileID(loc);
- SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1);
- if (SM.isInFileID(afterLoc, FID))
- return false; // Still in the same FileID, does not point to the last token.
-
- // FIXME: If the token comes from the macro token paste operator ('##')
- // or the stringify operator ('#') this function will always return false;
+ SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
+ SourceLocation expansionLoc;
+ if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
+ return false;
- SourceLocation expansionLoc =
- SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd();
if (expansionLoc.isFileID()) {
// No other macro expansions.
if (MacroEnd)
@@ -916,25 +913,25 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
return makeRangeFromFileLocs(Range, SM, LangOpts);
}
- FileID FID;
- unsigned BeginOffs;
- llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
- if (FID.isInvalid())
+ bool Invalid = false;
+ const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
+ &Invalid);
+ if (Invalid)
return CharSourceRange();
- unsigned EndOffs;
- if (!SM.isInFileID(End, FID, &EndOffs) ||
- BeginOffs > EndOffs)
- return CharSourceRange();
+ if (BeginEntry.getExpansion().isMacroArgExpansion()) {
+ const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
+ &Invalid);
+ if (Invalid)
+ return CharSourceRange();
- const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
- const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
- if (Expansion.isMacroArgExpansion() &&
- Expansion.getSpellingLoc().isFileID()) {
- SourceLocation SpellLoc = Expansion.getSpellingLoc();
- Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs));
- Range.setEnd(SpellLoc.getLocWithOffset(EndOffs));
- return makeRangeFromFileLocs(Range, SM, LangOpts);
+ if (EndEntry.getExpansion().isMacroArgExpansion() &&
+ BeginEntry.getExpansion().getExpansionLocStart() ==
+ EndEntry.getExpansion().getExpansionLocStart()) {
+ Range.setBegin(SM.getImmediateSpellingLoc(Begin));
+ Range.setEnd(SM.getImmediateSpellingLoc(End));
+ return makeFileCharRange(Range, SM, LangOpts);
+ }
}
return CharSourceRange();
@@ -1369,26 +1366,42 @@ void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
BufferPtr += Bytes;
if (BufferPtr > BufferEnd)
BufferPtr = BufferEnd;
+ // FIXME: What exactly does the StartOfLine bit mean? There are two
+ // possible meanings for the "start" of the line: the first token on the
+ // unexpanded line, or the first token on the expanded line.
IsAtStartOfLine = StartOfLine;
+ IsAtPhysicalStartOfLine = StartOfLine;
}
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
- if (LangOpts.CPlusPlus11 || LangOpts.C11)
- return isCharInSet(C, C11AllowedIDChars);
- else if (LangOpts.CPlusPlus)
- return isCharInSet(C, CXX03AllowedIDChars);
- else
- return isCharInSet(C, C99AllowedIDChars);
+ if (LangOpts.CPlusPlus11 || LangOpts.C11) {
+ static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
+ C11AllowedIDCharRanges);
+ return C11AllowedIDChars.contains(C);
+ } else if (LangOpts.CPlusPlus) {
+ static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
+ CXX03AllowedIDCharRanges);
+ return CXX03AllowedIDChars.contains(C);
+ } else {
+ static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
+ C99AllowedIDCharRanges);
+ return C99AllowedIDChars.contains(C);
+ }
}
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
assert(isAllowedIDChar(C, LangOpts));
- if (LangOpts.CPlusPlus11 || LangOpts.C11)
- return !isCharInSet(C, C11DisallowedInitialIDChars);
- else if (LangOpts.CPlusPlus)
+ if (LangOpts.CPlusPlus11 || LangOpts.C11) {
+ static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
+ C11DisallowedInitialIDCharRanges);
+ return !C11DisallowedInitialIDChars.contains(C);
+ } else if (LangOpts.CPlusPlus) {
return true;
- else
- return !isCharInSet(C, C99DisallowedInitialIDChars);
+ } else {
+ static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
+ C99DisallowedInitialIDCharRanges);
+ return !C99DisallowedInitialIDChars.contains(C);
+ }
}
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
@@ -1407,11 +1420,15 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
CannotStartIdentifier
};
- if (!isCharInSet(C, C99AllowedIDChars)) {
+ static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
+ C99AllowedIDCharRanges);
+ static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
+ C99DisallowedInitialIDCharRanges);
+ if (!C99AllowedIDChars.contains(C)) {
Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
<< Range
<< CannotAppearInIdentifier;
- } else if (IsFirst && isCharInSet(C, C99DisallowedInitialIDChars)) {
+ } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
<< Range
<< CannotStartIdentifier;
@@ -1421,14 +1438,16 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
// Check C++98 compatibility.
if (Diags.getDiagnosticLevel(diag::warn_cxx98_compat_unicode_id,
Range.getBegin()) > DiagnosticsEngine::Ignored) {
- if (!isCharInSet(C, CXX03AllowedIDChars)) {
+ static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
+ CXX03AllowedIDCharRanges);
+ if (!CXX03AllowedIDChars.contains(C)) {
Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
<< Range;
}
}
}
-void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
+bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
unsigned Size;
unsigned char C = *CurPtr++;
@@ -1452,7 +1471,7 @@ FinishIdentifier:
// If we are in raw mode, return this identifier raw. There is no need to
// look up identifier information or attempt to macro expand it.
if (LexingRawMode)
- return;
+ return true;
// Fill in Result.IdentifierInfo and update the token kind,
// looking up the identifier in the identifier table.
@@ -1461,9 +1480,9 @@ FinishIdentifier:
// Finally, now that we know we have an identifier, pass this off to the
// preprocessor, which may macro expand it or something.
if (II->isHandleIdentifierCase())
- PP->HandleIdentifier(Result);
+ return PP->HandleIdentifier(Result);
- return;
+ return true;
}
// Otherwise, $,\,? in identifier found. Enter slower path.
@@ -1553,7 +1572,7 @@ bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
-void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
+bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
@@ -1587,15 +1606,29 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
}
+ // If we have a digit separator, continue.
+ if (C == '\'' && getLangOpts().CPlusPlus1y) {
+ unsigned NextSize;
+ char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
+ if (isIdentifierBody(Next)) {
+ if (!isLexingRawMode())
+ Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
+ CurPtr = ConsumeChar(CurPtr, Size, Result);
+ return LexNumericConstant(Result, CurPtr);
+ }
+ }
+
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
Result.setLiteralData(TokStart);
+ return true;
}
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
-const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) {
+const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
+ bool IsStringLiteral) {
assert(getLangOpts().CPlusPlus);
// Maximally munch an identifier. FIXME: UCNs.
@@ -1615,9 +1648,41 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) {
// that does not start with an underscore is ill-formed. As a conforming
// extension, we treat all such suffixes as if they had whitespace before
// them.
- if (C != '_') {
+ bool IsUDSuffix = false;
+ if (C == '_')
+ IsUDSuffix = true;
+ else if (IsStringLiteral && getLangOpts().CPlusPlus1y) {
+ // In C++1y, we need to look ahead a few characters to see if this is a
+ // valid suffix for a string literal or a numeric literal (this could be
+ // the 'operator""if' defining a numeric literal operator).
+ const unsigned MaxStandardSuffixLength = 3;
+ char Buffer[MaxStandardSuffixLength] = { C };
+ unsigned Consumed = Size;
+ unsigned Chars = 1;
+ while (true) {
+ unsigned NextSize;
+ char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
+ getLangOpts());
+ if (!isIdentifierBody(Next)) {
+ // End of suffix. Check whether this is on the whitelist.
+ IsUDSuffix = (Chars == 1 && Buffer[0] == 's') ||
+ NumericLiteralParser::isValidUDSuffix(
+ getLangOpts(), StringRef(Buffer, Chars));
+ break;
+ }
+
+ if (Chars == MaxStandardSuffixLength)
+ // Too long: can't be a standard suffix.
+ break;
+
+ Buffer[Chars++] = Next;
+ Consumed += NextSize;
+ }
+ }
+
+ if (!IsUDSuffix) {
if (!isLexingRawMode())
- Diag(CurPtr, getLangOpts().MicrosoftMode ?
+ Diag(CurPtr, getLangOpts().MicrosoftMode ?
diag::ext_ms_reserved_user_defined_literal :
diag::ext_reserved_user_defined_literal)
<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
@@ -1635,7 +1700,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) {
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
-void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
+bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
tok::TokenKind Kind) {
const char *NulCharacter = 0; // Does this string contain the \0 character?
@@ -1659,14 +1724,15 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
Diag(BufferPtr, diag::ext_unterminated_string);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
- return;
+ return true;
}
if (C == 0) {
if (isCodeCompletionPoint(CurPtr-1)) {
PP->CodeCompleteNaturalLanguage();
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
- return cutOffLexing();
+ cutOffLexing();
+ return true;
}
NulCharacter = CurPtr-1;
@@ -1676,7 +1742,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
// If we are in C++11, lex the optional ud-suffix.
if (getLangOpts().CPlusPlus)
- CurPtr = LexUDSuffix(Result, CurPtr);
+ CurPtr = LexUDSuffix(Result, CurPtr, true);
// If a nul character existed in the string, warn about it.
if (NulCharacter && !isLexingRawMode())
@@ -1686,11 +1752,12 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
+ return true;
}
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
-void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
+bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
tok::TokenKind Kind) {
// This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
// Between the initial and final double quote characters of the raw string,
@@ -1732,7 +1799,7 @@ void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
}
FormTokenWithChars(Result, CurPtr, tok::unknown);
- return;
+ return true;
}
// Save prefix and move CurPtr past it
@@ -1753,23 +1820,24 @@ void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
Diag(BufferPtr, diag::err_unterminated_raw_string)
<< StringRef(Prefix, PrefixLen);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
- return;
+ return true;
}
}
// If we are in C++11, lex the optional ud-suffix.
if (getLangOpts().CPlusPlus)
- CurPtr = LexUDSuffix(Result, CurPtr);
+ CurPtr = LexUDSuffix(Result, CurPtr, true);
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
+ return true;
}
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
-void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
+bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
const char *NulCharacter = 0; // Does this string contain the \0 character?
const char *AfterLessPos = CurPtr;
char C = getAndAdvanceChar(CurPtr, Result);
@@ -1784,7 +1852,7 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
// If the filename is unterminated, then it must just be a lone <
// character. Return this as such.
FormTokenWithChars(Result, AfterLessPos, tok::less);
- return;
+ return true;
} else if (C == 0) {
NulCharacter = CurPtr-1;
}
@@ -1799,12 +1867,13 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
Result.setLiteralData(TokStart);
+ return true;
}
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u' or U'.
-void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
+bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
tok::TokenKind Kind) {
const char *NulCharacter = 0; // Does this character contain the \0 character?
@@ -1819,7 +1888,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
Diag(BufferPtr, diag::ext_empty_character);
FormTokenWithChars(Result, CurPtr, tok::unknown);
- return;
+ return true;
}
while (C != '\'') {
@@ -1832,14 +1901,15 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
Diag(BufferPtr, diag::ext_unterminated_char);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
- return;
+ return true;
}
if (C == 0) {
if (isCodeCompletionPoint(CurPtr-1)) {
PP->CodeCompleteNaturalLanguage();
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
- return cutOffLexing();
+ cutOffLexing();
+ return true;
}
NulCharacter = CurPtr-1;
@@ -1849,7 +1919,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
// If we are in C++11, lex the optional ud-suffix.
if (getLangOpts().CPlusPlus)
- CurPtr = LexUDSuffix(Result, CurPtr);
+ CurPtr = LexUDSuffix(Result, CurPtr, false);
// If a nul character existed in the character, warn about it.
if (NulCharacter && !isLexingRawMode())
@@ -1859,6 +1929,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
+ return true;
}
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
@@ -1866,11 +1937,14 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
-bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
+bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
+ bool &TokAtPhysicalStartOfLine) {
// Whitespace - Skip it, then return the token after the whitespace.
bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
- unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently.
+ unsigned char Char = *CurPtr;
+
+ // Skip consecutive spaces efficiently.
while (1) {
// Skip horizontal whitespace very aggressively.
while (isHorizontalWhitespace(Char))
@@ -1886,7 +1960,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
return false;
}
- // ok, but handle newline.
+ // OK, but handle newline.
SawNewline = true;
Char = *++CurPtr;
}
@@ -1894,8 +1968,10 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
// If the client wants us to return whitespace, return it now.
if (isKeepWhitespaceMode()) {
FormTokenWithChars(Result, CurPtr, tok::unknown);
- if (SawNewline)
+ if (SawNewline) {
IsAtStartOfLine = true;
+ IsAtPhysicalStartOfLine = true;
+ }
// FIXME: The next token will not have LeadingSpace set.
return true;
}
@@ -1905,8 +1981,10 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
- if (SawNewline)
+ if (SawNewline) {
Result.setFlag(Token::StartOfLine);
+ TokAtPhysicalStartOfLine = true;
+ }
BufferPtr = CurPtr;
return false;
@@ -1918,7 +1996,8 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
-bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) {
+bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
+ bool &TokAtPhysicalStartOfLine) {
// If Line comments aren't explicitly enabled for this language, emit an
// extension warning.
if (!LangOpts.LineComment && !isLexingRawMode()) {
@@ -2037,6 +2116,7 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) {
// The next returned token is at the start of the line.
Result.setFlag(Token::StartOfLine);
+ TokAtPhysicalStartOfLine = true;
// No leading whitespace seen so far.
Result.clearFlag(Token::LeadingSpace);
BufferPtr = CurPtr;
@@ -2147,7 +2227,8 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
-bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
+bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
+ bool &TokAtPhysicalStartOfLine) {
// Scan one character past where we should, looking for a '/' character. Once
// we find it, check to see if it was preceded by a *. This common
// optimization helps people who like to put a lot of * characters in their
@@ -2202,7 +2283,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
// Adjust the pointer to point directly after the first slash. It's
// not necessary to set C here, it will be overwritten at the end of
// the outer loop.
- CurPtr += llvm::CountTrailingZeros_32(cmp) + 1;
+ CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
goto FoundSlash;
}
CurPtr += 16;
@@ -2298,7 +2379,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
// efficiently now. This is safe even in KeepWhitespaceMode because we would
// have already returned above with the comment as a token.
if (isHorizontalWhitespace(*CurPtr)) {
- SkipWhitespace(Result, CurPtr+1);
+ SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
return false;
}
@@ -2404,10 +2485,28 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
// C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
// a pedwarn.
- if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
- Diag(BufferEnd, LangOpts.CPlusPlus11 ? // C++11 [lex.phases] 2.2 p2
- diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof)
- << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
+ if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
+ DiagnosticsEngine &Diags = PP->getDiagnostics();
+ SourceLocation EndLoc = getSourceLocation(BufferEnd);
+ unsigned DiagID;
+
+ if (LangOpts.CPlusPlus11) {
+ // C++11 [lex.phases] 2.2 p2
+ // Prefer the C++98 pedantic compatibility warning over the generic,
+ // non-extension, user-requested "missing newline at EOF" warning.
+ if (Diags.getDiagnosticLevel(diag::warn_cxx98_compat_no_newline_eof,
+ EndLoc) != DiagnosticsEngine::Ignored) {
+ DiagID = diag::warn_cxx98_compat_no_newline_eof;
+ } else {
+ DiagID = diag::warn_no_newline_eof;
+ }
+ } else {
+ DiagID = diag::ext_no_newline_eof;
+ }
+
+ Diag(BufferEnd, DiagID)
+ << FixItHint::CreateInsertion(EndLoc, "\n");
+ }
BufferPtr = CurPtr;
@@ -2430,14 +2529,19 @@ unsigned Lexer::isNextPPTokenLParen() {
// Save state that can be changed while lexing so that we can restore it.
const char *TmpBufferPtr = BufferPtr;
bool inPPDirectiveMode = ParsingPreprocessorDirective;
+ bool atStartOfLine = IsAtStartOfLine;
+ bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
+ bool leadingSpace = HasLeadingSpace;
Token Tok;
- Tok.startToken();
- LexTokenInternal(Tok);
+ Lex(Tok);
// Restore state that may have changed.
BufferPtr = TmpBufferPtr;
ParsingPreprocessorDirective = inPPDirectiveMode;
+ HasLeadingSpace = leadingSpace;
+ IsAtStartOfLine = atStartOfLine;
+ IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
// Restore the lexer back to non-skipping mode.
LexingRawMode = false;
@@ -2626,6 +2730,10 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
StartPtr = CurPtr;
}
+ // Don't apply C family restrictions to UCNs in assembly mode
+ if (LangOpts.AsmPreprocessor)
+ return CodePoint;
+
// C99 6.4.3p2: A universal character name shall not specify a character whose
// short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
// 0060 (`), nor one in the range D800 through DFFF inclusive.)
@@ -2670,19 +2778,22 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
return CodePoint;
}
-void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
+bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
+ const char *CurPtr) {
+ static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
+ UnicodeWhitespaceCharRanges);
if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
- isCharInSet(C, UnicodeWhitespaceChars)) {
+ UnicodeWhitespaceChars.contains(C)) {
Diag(BufferPtr, diag::ext_unicode_whitespace)
<< makeCharRange(*this, BufferPtr, CurPtr);
Result.setFlag(Token::LeadingSpace);
- if (SkipWhitespace(Result, CurPtr))
- return; // KeepWhitespaceMode
-
- return LexTokenInternal(Result);
+ return true;
}
+ return false;
+}
+bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput()) {
@@ -2711,22 +2822,59 @@ void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
<< FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
BufferPtr = CurPtr;
- return LexTokenInternal(Result);
+ return false;
}
// Otherwise, we have an explicit UCN or a character that's unlikely to show
// up by accident.
MIOpt.ReadToken();
FormTokenWithChars(Result, CurPtr, tok::unknown);
+ return true;
}
+void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
+ IsAtStartOfLine = Result.isAtStartOfLine();
+ HasLeadingSpace = Result.hasLeadingSpace();
+ HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
+ // Note that this doesn't affect IsAtPhysicalStartOfLine.
+}
+
+bool Lexer::Lex(Token &Result) {
+ // Start a new token.
+ Result.startToken();
+
+ // Set up misc whitespace flags for LexTokenInternal.
+ if (IsAtStartOfLine) {
+ Result.setFlag(Token::StartOfLine);
+ IsAtStartOfLine = false;
+ }
+
+ if (HasLeadingSpace) {
+ Result.setFlag(Token::LeadingSpace);
+ HasLeadingSpace = false;
+ }
+
+ if (HasLeadingEmptyMacro) {
+ Result.setFlag(Token::LeadingEmptyMacro);
+ HasLeadingEmptyMacro = false;
+ }
+
+ bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
+ IsAtPhysicalStartOfLine = false;
+ bool isRawLex = isLexingRawMode();
+ (void) isRawLex;
+ bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
+ // (After the LexTokenInternal call, the lexer might be destroyed.)
+ assert((returnedToken || !isRawLex) && "Raw lex must succeed");
+ return returnedToken;
+}
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
/// has a null character at the end of the file. This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface. It assumes
/// that the Flags of result have been cleared before calling this.
-void Lexer::LexTokenInternal(Token &Result) {
+bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
// New token, can't need cleaning yet.
Result.clearFlag(Token::NeedsCleaning);
@@ -2747,7 +2895,7 @@ LexNextToken:
if (isKeepWhitespaceMode()) {
FormTokenWithChars(Result, CurPtr, tok::unknown);
// FIXME: The next token will not have LeadingSpace set.
- return;
+ return true;
}
BufferPtr = CurPtr;
@@ -2763,43 +2911,32 @@ LexNextToken:
switch (Char) {
case 0: // Null.
// Found end of file?
- if (CurPtr-1 == BufferEnd) {
- // Read the PP instance variable into an automatic variable, because
- // LexEndOfFile will often delete 'this'.
- Preprocessor *PPCache = PP;
- if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file.
- return; // Got a token to return.
- assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
- return PPCache->Lex(Result);
- }
+ if (CurPtr-1 == BufferEnd)
+ return LexEndOfFile(Result, CurPtr-1);
// Check if we are performing code completion.
if (isCodeCompletionPoint(CurPtr-1)) {
// Return the code-completion token.
Result.startToken();
FormTokenWithChars(Result, CurPtr, tok::code_completion);
- return;
+ return true;
}
if (!isLexingRawMode())
Diag(CurPtr-1, diag::null_in_file);
Result.setFlag(Token::LeadingSpace);
- if (SkipWhitespace(Result, CurPtr))
- return; // KeepWhitespaceMode
+ if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
+ return true; // KeepWhitespaceMode
- goto LexNextToken; // GCC isn't tail call eliminating.
+ // We know the lexer hasn't changed, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
case 26: // DOS & CP/M EOF: "^Z".
// If we're in Microsoft extensions mode, treat this as end of file.
- if (LangOpts.MicrosoftExt) {
- // Read the PP instance variable into an automatic variable, because
- // LexEndOfFile will often delete 'this'.
- Preprocessor *PPCache = PP;
- if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file.
- return; // Got a token to return.
- assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
- return PPCache->Lex(Result);
- }
+ if (LangOpts.MicrosoftExt)
+ return LexEndOfFile(Result, CurPtr-1);
+
// If Microsoft extensions are disabled, this is just random garbage.
Kind = tok::unknown;
break;
@@ -2818,6 +2955,7 @@ LexNextToken:
// Since we consumed a newline, we are back at the start of a line.
IsAtStartOfLine = true;
+ IsAtPhysicalStartOfLine = true;
Kind = tok::eod;
break;
@@ -2826,17 +2964,20 @@ LexNextToken:
// No leading whitespace seen so far.
Result.clearFlag(Token::LeadingSpace);
- if (SkipWhitespace(Result, CurPtr))
- return; // KeepWhitespaceMode
- goto LexNextToken; // GCC isn't tail call eliminating.
+ if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
+ return true; // KeepWhitespaceMode
+
+ // We only saw whitespace, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
case ' ':
case '\t':
case '\f':
case '\v':
SkipHorizontalWhitespace:
Result.setFlag(Token::LeadingSpace);
- if (SkipWhitespace(Result, CurPtr))
- return; // KeepWhitespaceMode
+ if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
+ return true; // KeepWhitespaceMode
SkipIgnoredUnits:
CurPtr = BufferPtr;
@@ -2844,18 +2985,21 @@ LexNextToken:
// If the next token is obviously a // or /* */ comment, skip it efficiently
// too (without going through the big switch stmt).
if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
- LangOpts.LineComment && !LangOpts.TraditionalCPP) {
- if (SkipLineComment(Result, CurPtr+2))
- return; // There is a token to return.
+ LangOpts.LineComment &&
+ (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
+ if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
+ return true; // There is a token to return.
goto SkipIgnoredUnits;
} else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
- if (SkipBlockComment(Result, CurPtr+2))
- return; // There is a token to return.
+ if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
+ return true; // There is a token to return.
goto SkipIgnoredUnits;
} else if (isHorizontalWhitespace(*CurPtr)) {
goto SkipHorizontalWhitespace;
}
- goto LexNextToken; // GCC isn't tail call eliminating.
+ // We only saw whitespace, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
// C99 6.4.4.1: Integer Constants.
// C99 6.4.4.2: Floating Constants.
@@ -3141,14 +3285,16 @@ LexNextToken:
// "foo". Check to see if the character after the second slash is a '*'.
// If so, we will lex that as a "/" instead of the start of a comment.
// However, we never do this if we are just preprocessing.
- bool TreatAsComment = LangOpts.LineComment && !LangOpts.TraditionalCPP;
+ bool TreatAsComment = LangOpts.LineComment &&
+ (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
if (!TreatAsComment)
if (!(PP && PP->isPreprocessedOutput()))
TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
if (TreatAsComment) {
- if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
- return; // There is a token to return.
+ if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ TokAtPhysicalStartOfLine))
+ return true; // There is a token to return.
// It is common for the tokens immediately after a // comment to be
// whitespace (indentation for the next line). Instead of going through
@@ -3158,9 +3304,13 @@ LexNextToken:
}
if (Char == '*') { // /**/ comment.
- if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
- return; // There is a token to return.
- goto LexNextToken; // GCC isn't tail call eliminating.
+ if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ TokAtPhysicalStartOfLine))
+ return true; // There is a token to return.
+
+ // We only saw whitespace, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
}
if (Char == '=') {
@@ -3195,7 +3345,7 @@ LexNextToken:
// it's actually the start of a preprocessing directive. Callback to
// the preprocessor to handle it.
// FIXME: -fpreprocessed mode??
- if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
+ if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
goto HandleDirective;
Kind = tok::hash;
@@ -3361,7 +3511,7 @@ LexNextToken:
// it's actually the start of a preprocessing directive. Callback to
// the preprocessor to handle it.
// FIXME: -fpreprocessed mode??
- if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
+ if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
goto HandleDirective;
Kind = tok::hash;
@@ -3378,8 +3528,18 @@ LexNextToken:
// UCNs (C99 6.4.3, C++11 [lex.charset]p2)
case '\\':
- if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
+ if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
+ if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
+ if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
+ return true; // KeepWhitespaceMode
+
+ // We only saw whitespace, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
+ }
+
return LexUnicode(Result, CodePoint, CurPtr);
+ }
Kind = tok::unknown;
break;
@@ -3400,8 +3560,17 @@ LexNextToken:
(const UTF8 *)BufferEnd,
&CodePoint,
strictConversion);
- if (Status == conversionOK)
+ if (Status == conversionOK) {
+ if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
+ if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
+ return true; // KeepWhitespaceMode
+
+ // We only saw whitespace, so just try again with this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
+ goto LexNextToken;
+ }
return LexUnicode(Result, CodePoint, CurPtr);
+ }
if (isLexingRawMode() || ParsingPreprocessorDirective ||
PP->isPreprocessedOutput()) {
@@ -3416,6 +3585,9 @@ LexNextToken:
Diag(CurPtr, diag::err_invalid_utf8);
BufferPtr = CurPtr+1;
+ // We're pretending the character didn't exist, so just try again with
+ // this lexer.
+ // (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
}
}
@@ -3425,7 +3597,7 @@ LexNextToken:
// Update the location of token as well as BufferPtr.
FormTokenWithChars(Result, CurPtr, Kind);
- return;
+ return true;
HandleDirective:
// We parsed a # character and it's the start of a preprocessing directive.
@@ -3433,18 +3605,12 @@ HandleDirective:
FormTokenWithChars(Result, CurPtr, tok::hash);
PP->HandleDirective(Result);
- // As an optimization, if the preprocessor didn't switch lexers, tail
- // recurse.
- if (PP->isCurrentLexer(this)) {
- // Start a new token. If this is a #include or something, the PP may
- // want us starting at the beginning of the line again. If so, set
- // the StartOfLine flag and clear LeadingSpace.
- if (IsAtStartOfLine) {
- Result.setFlag(Token::StartOfLine);
- Result.clearFlag(Token::LeadingSpace);
- IsAtStartOfLine = false;
- }
- goto LexNextToken; // GCC isn't tail call eliminating.
+ if (PP->hadModuleLoaderFatalFailure()) {
+ // With a fatal failure in the module loader, we abort parsing.
+ assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
+ return true;
}
- return PP->Lex(Result);
+
+ // We parsed the directive; lex a token with the new state.
+ return false;
}