aboutsummaryrefslogtreecommitdiff
path: root/clang/lib/Tooling/Transformer/SourceCode.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib/Tooling/Transformer/SourceCode.cpp')
-rw-r--r--clang/lib/Tooling/Transformer/SourceCode.cpp370
1 files changed, 353 insertions, 17 deletions
diff --git a/clang/lib/Tooling/Transformer/SourceCode.cpp b/clang/lib/Tooling/Transformer/SourceCode.cpp
index 836401d1e605..26b204851f05 100644
--- a/clang/lib/Tooling/Transformer/SourceCode.cpp
+++ b/clang/lib/Tooling/Transformer/SourceCode.cpp
@@ -10,10 +10,24 @@
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Transformer/SourceCode.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/Comment.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
+#include "clang/Basic/SourceManager.h"
#include "clang/Lex/Lexer.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include <set>
using namespace clang;
+using llvm::errc;
+using llvm::StringError;
+
StringRef clang::tooling::getText(CharSourceRange Range,
const ASTContext &Context) {
return Lexer::getSourceText(Range, Context.getSourceManager(),
@@ -23,11 +37,45 @@ StringRef clang::tooling::getText(CharSourceRange Range,
CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
tok::TokenKind Next,
ASTContext &Context) {
- Optional<Token> Tok = Lexer::findNextToken(
- Range.getEnd(), Context.getSourceManager(), Context.getLangOpts());
- if (!Tok || !Tok->is(Next))
+ CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),
+ Context.getLangOpts());
+ if (R.isInvalid())
+ return Range;
+ Token Tok;
+ bool Err =
+ Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
+ Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
+ if (Err || !Tok.is(Next))
return Range;
- return CharSourceRange::getTokenRange(Range.getBegin(), Tok->getLocation());
+ return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
+}
+
+llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
+ const SourceManager &SM) {
+ if (Range.isInvalid())
+ return llvm::make_error<StringError>(errc::invalid_argument,
+ "Invalid range");
+
+ if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
+ return llvm::make_error<StringError>(
+ errc::invalid_argument, "Range starts or ends in a macro expansion");
+
+ if (SM.isInSystemHeader(Range.getBegin()) ||
+ SM.isInSystemHeader(Range.getEnd()))
+ return llvm::make_error<StringError>(errc::invalid_argument,
+ "Range is in system header");
+
+ std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
+ std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
+ if (BeginInfo.first != EndInfo.first)
+ return llvm::make_error<StringError>(
+ errc::invalid_argument, "Range begins and ends in different files");
+
+ if (BeginInfo.second > EndInfo.second)
+ return llvm::make_error<StringError>(
+ errc::invalid_argument, "Range's begin is past its end");
+
+ return llvm::Error::success();
}
llvm::Optional<CharSourceRange>
@@ -46,20 +94,308 @@ clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
// foo(DO_NOTHING(6))
// Decide whether the current behavior is desirable and modify if not.
CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
- if (Range.isInvalid())
- return None;
+ bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
+ if (IsInvalid)
+ return llvm::None;
+ return Range;
- if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
- return None;
- if (SM.isInSystemHeader(Range.getBegin()) ||
- SM.isInSystemHeader(Range.getEnd()))
- return None;
+}
- std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
- std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
- if (BeginInfo.first != EndInfo.first ||
- BeginInfo.second > EndInfo.second)
- return None;
+static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
+ return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
+}
- return Range;
+static bool contains(const std::set<tok::TokenKind> &Terminators,
+ const Token &Tok) {
+ return Terminators.count(Tok.getKind()) > 0;
+}
+
+// Returns the exclusive, *file* end location of the entity whose last token is
+// at location 'EntityLast'. That is, it returns the location one past the last
+// relevant character.
+//
+// Associated tokens include comments, horizontal whitespace and 'Terminators'
+// -- optional tokens, which, if any are found, will be included; if
+// 'Terminators' is empty, we will not include any extra tokens beyond comments
+// and horizontal whitespace.
+static SourceLocation
+getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
+ const std::set<tok::TokenKind> &Terminators,
+ const LangOptions &LangOpts) {
+ assert(EntityLast.isValid() && "Invalid end location found.");
+
+ // We remember the last location of a non-horizontal-whitespace token we have
+ // lexed; this is the location up to which we will want to delete.
+ // FIXME: Support using the spelling loc here for cases where we want to
+ // analyze the macro text.
+
+ CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
+ // FIXME: Should check isTokenRange(), for the (rare) case that
+ // `ExpansionRange` is a character range.
+ std::unique_ptr<Lexer> Lexer = [&]() {
+ bool Invalid = false;
+ auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
+ llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
+ assert(!Invalid && "Cannot get file/offset");
+ return std::make_unique<clang::Lexer>(
+ SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
+ File.data() + FileOffset.second, File.end());
+ }();
+
+ // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
+ Lexer->SetKeepWhitespaceMode(true);
+
+ // Generally, the code we want to include looks like this ([] are optional),
+ // If Terminators is empty:
+ // [ <comment> ] [ <newline> ]
+ // Otherwise:
+ // ... <terminator> [ <comment> ] [ <newline> ]
+
+ Token Tok;
+ bool Terminated = false;
+
+ // First, lex to the current token (which is the last token of the range that
+ // is definitely associated with the decl). Then, we process the first token
+ // separately from the rest based on conditions that hold specifically for
+ // that first token.
+ //
+ // We do not search for a terminator if none is required or we've already
+ // encountered it. Otherwise, if the original `EntityLast` location was in a
+ // macro expansion, we don't have visibility into the text, so we assume we've
+ // already terminated. However, we note this assumption with
+ // `TerminatedByMacro`, because we'll want to handle it somewhat differently
+ // for the terminators semicolon and comma. These terminators can be safely
+ // associated with the entity when they appear after the macro -- extra
+ // semicolons have no effect on the program and a well-formed program won't
+ // have multiple commas in a row, so we're guaranteed that there is only one.
+ //
+ // FIXME: This handling of macros is more conservative than necessary. When
+ // the end of the expansion coincides with the end of the node, we can still
+ // safely analyze the code. But, it is more complicated, because we need to
+ // start by lexing the spelling loc for the first token and then switch to the
+ // expansion loc.
+ bool TerminatedByMacro = false;
+ Lexer->LexFromRawLexer(Tok);
+ if (Terminators.empty() || contains(Terminators, Tok))
+ Terminated = true;
+ else if (EntityLast.isMacroID()) {
+ Terminated = true;
+ TerminatedByMacro = true;
+ }
+
+ // We save the most recent candidate for the exclusive end location.
+ SourceLocation End = Tok.getEndLoc();
+
+ while (!Terminated) {
+ // Lex the next token we want to possibly expand the range with.
+ Lexer->LexFromRawLexer(Tok);
+
+ switch (Tok.getKind()) {
+ case tok::eof:
+ // Unexpected separators.
+ case tok::l_brace:
+ case tok::r_brace:
+ case tok::comma:
+ return End;
+ // Whitespace pseudo-tokens.
+ case tok::unknown:
+ if (startsWithNewline(SM, Tok))
+ // Include at least until the end of the line.
+ End = Tok.getEndLoc();
+ break;
+ default:
+ if (contains(Terminators, Tok))
+ Terminated = true;
+ End = Tok.getEndLoc();
+ break;
+ }
+ }
+
+ do {
+ // Lex the next token we want to possibly expand the range with.
+ Lexer->LexFromRawLexer(Tok);
+
+ switch (Tok.getKind()) {
+ case tok::unknown:
+ if (startsWithNewline(SM, Tok))
+ // We're done, but include this newline.
+ return Tok.getEndLoc();
+ break;
+ case tok::comment:
+ // Include any comments we find on the way.
+ End = Tok.getEndLoc();
+ break;
+ case tok::semi:
+ case tok::comma:
+ if (TerminatedByMacro && contains(Terminators, Tok)) {
+ End = Tok.getEndLoc();
+ // We've found a real terminator.
+ TerminatedByMacro = false;
+ break;
+ }
+ // Found an unrelated token; stop and don't include it.
+ return End;
+ default:
+ // Found an unrelated token; stop and don't include it.
+ return End;
+ }
+ } while (true);
+}
+
+// Returns the expected terminator tokens for the given declaration.
+//
+// If we do not know the correct terminator token, returns an empty set.
+//
+// There are cases where we have more than one possible terminator (for example,
+// we find either a comma or a semicolon after a VarDecl).
+static std::set<tok::TokenKind> getTerminators(const Decl &D) {
+ if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
+ return {tok::semi};
+
+ if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
+ return {tok::r_brace, tok::semi};
+
+ if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
+ return {tok::comma, tok::semi};
+
+ return {};
+}
+
+// Starting from `Loc`, skips whitespace up to, and including, a single
+// newline. Returns the (exclusive) end of any skipped whitespace (that is, the
+// location immediately after the whitespace).
+static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
+ SourceLocation Loc,
+ const LangOptions &LangOpts) {
+ const char *LocChars = SM.getCharacterData(Loc);
+ int i = 0;
+ while (isHorizontalWhitespace(LocChars[i]))
+ ++i;
+ if (isVerticalWhitespace(LocChars[i]))
+ ++i;
+ return Loc.getLocWithOffset(i);
+}
+
+// Is `Loc` separated from any following decl by something meaningful (e.g. an
+// empty line, a comment), ignoring horizontal whitespace? Since this is a
+// heuristic, we return false when in doubt. `Loc` cannot be the first location
+// in the file.
+static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
+ const LangOptions &LangOpts) {
+ // If the preceding character is a newline, we'll check for an empty line as a
+ // separator. However, we can't identify an empty line using tokens, so we
+ // analyse the characters. If we try to use tokens, we'll just end up with a
+ // whitespace token, whose characters we'd have to analyse anyhow.
+ bool Invalid = false;
+ const char *LocChars =
+ SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
+ assert(!Invalid &&
+ "Loc must be a valid character and not the first of the source file.");
+ if (isVerticalWhitespace(LocChars[0])) {
+ for (int i = 1; isWhitespace(LocChars[i]); ++i)
+ if (isVerticalWhitespace(LocChars[i]))
+ return true;
+ }
+ // We didn't find an empty line, so lex the next token, skipping past any
+ // whitespace we just scanned.
+ Token Tok;
+ bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
+ /*IgnoreWhiteSpace=*/true);
+ if (Failed)
+ // Any text that confuses the lexer seems fair to consider a separation.
+ return true;
+
+ switch (Tok.getKind()) {
+ case tok::comment:
+ case tok::l_brace:
+ case tok::r_brace:
+ case tok::eof:
+ return true;
+ default:
+ return false;
+ }
+}
+
+CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
+ ASTContext &Context) {
+ const SourceManager &SM = Context.getSourceManager();
+ const LangOptions &LangOpts = Context.getLangOpts();
+ CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
+
+ // First, expand to the start of the template<> declaration if necessary.
+ if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
+ if (const auto *T = Record->getDescribedClassTemplate())
+ if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
+ Range.setBegin(T->getBeginLoc());
+ } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
+ if (const auto *T = F->getDescribedFunctionTemplate())
+ if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
+ Range.setBegin(T->getBeginLoc());
+ }
+
+ // Next, expand the end location past trailing comments to include a potential
+ // newline at the end of the decl's line.
+ Range.setEnd(
+ getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
+ Range.setTokenRange(false);
+
+  // Expand to include preceding associated comments. We ignore any comments
+  // that are not preceding the decl, since we've already skipped trailing
+ // comments with getEntityEndLoc.
+ if (const RawComment *Comment =
+ Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
+ // Only include a preceding comment if:
+ // * it is *not* separate from the declaration (not including any newline
+ // that immediately follows the comment),
+ // * the decl *is* separate from any following entity (so, there are no
+ // other entities the comment could refer to), and
+  //   * it is not an IfThisThenThat lint check.
+ if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
+ Range.getBegin()) &&
+ !atOrBeforeSeparation(
+ SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
+ LangOpts) &&
+ atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
+ const StringRef CommentText = Comment->getRawText(SM);
+ if (!CommentText.contains("LINT.IfChange") &&
+ !CommentText.contains("LINT.ThenChange"))
+ Range.setBegin(Comment->getBeginLoc());
+ }
+ // Add leading attributes.
+ for (auto *Attr : Decl.attrs()) {
+ if (Attr->getLocation().isInvalid() ||
+ !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
+ continue;
+ Range.setBegin(Attr->getLocation());
+
+    // Extend to the left '[[' or '__attribute__((' if we saw the attribute,
+ // unless it is not a valid location.
+ bool Invalid;
+ StringRef Source =
+ SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
+ if (Invalid)
+ continue;
+ llvm::StringRef BeforeAttr =
+ Source.substr(0, SM.getFileOffset(Range.getBegin()));
+ llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
+
+ for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
+ // Handle whitespace between attribute prefix and attribute value.
+ if (BeforeAttrStripped.endswith(Prefix)) {
+ // Move start to start position of prefix, which is
+ // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
+ // positions to the left.
+ Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
+ -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
+ break;
+ // If we didn't see '[[' or '__attribute' it's probably coming from a
+ // macro expansion which is already handled by makeFileCharRange(),
+ // below.
+ }
+ }
+ }
+
+ // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
+ // Range.getBegin() may be inside an expansion.
+ return Lexer::makeFileCharRange(Range, SM, LangOpts);
}