Introduce a CIndex API for lexing the raw tokens within a given source
range. The token-annotation function does nothing, yet.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@94551 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/tools/CIndex/CIndex.cpp b/tools/CIndex/CIndex.cpp
index 03519ad..55061cb 100644
--- a/tools/CIndex/CIndex.cpp
+++ b/tools/CIndex/CIndex.cpp
@@ -876,6 +876,21 @@
return Str;
}
+/// Wraps \p String in a CXString. The characters are copied into malloc'd,
+/// NUL-terminated storage (MustFreeString = 1) when \p DupString is set or
+/// when a non-empty string is not already followed by a NUL character;
+/// otherwise the result points at the caller's buffer (MustFreeString = 0).
+CXString CIndexer::createCXString(llvm::StringRef String, bool DupString) {
+  CXString Wrapped;
+
+  // Borrow the existing buffer when no copy was requested and the string is
+  // either empty or already followed by a NUL character.
+  // NOTE(review): the probe inspects the character just past the end of the
+  // StringRef -- confirm that non-duplicating callers guarantee it is readable.
+  if (!DupString && (String.empty() || String.data()[String.size()] == 0)) {
+    Wrapped.Spelling = String.data();
+    Wrapped.MustFreeString = 0;
+    return Wrapped;
+  }
+
+  // Otherwise hand back an owned, NUL-terminated copy.
+  char *Copy = (char *)malloc(String.size() + 1);
+  memmove(Copy, String.data(), String.size());
+  Copy[String.size()] = 0;
+  Wrapped.Spelling = Copy;
+  Wrapped.MustFreeString = 1;
+  return Wrapped;
+}
+
extern "C" {
CXIndex clang_createIndex(int excludeDeclarationsFromPCH,
int displayDiagnostics) {
@@ -1882,6 +1897,183 @@
} // end: extern "C"
//===----------------------------------------------------------------------===//
+// Token-based Operations.
+//===----------------------------------------------------------------------===//
+
+/* CXToken layout:
+ * int_data[0]: a CXTokenKind
+ * int_data[1]: starting token location
+ * int_data[2]: token length
+ * int_data[3]: reserved
+ * ptr_data: for identifiers and keywords, an IdentifierInfo*;
+ * for literals, a pointer to the start of the literal text;
+ * otherwise unused (null).
+ */
+extern "C" {
+
+/// Returns the token kind recorded in the first integer slot of \p CXTok.
+CXTokenKind clang_getTokenKind(CXToken CXTok) {
+  CXTokenKind Kind = static_cast<CXTokenKind>(CXTok.int_data[0]);
+  return Kind;
+}
+
+/// Returns the spelling of \p CXTok as a CXString.
+///
+/// Identifiers and keywords are spelled through their IdentifierInfo, literals
+/// through the text pointer stored in the token, and every other kind by
+/// reading the token's characters out of the source buffer of \p TU. When the
+/// buffer is needed but \p TU is null, the empty string is returned.
+CXString clang_getTokenSpelling(CXTranslationUnit TU, CXToken CXTok) {
+  const CXTokenKind Kind = clang_getTokenKind(CXTok);
+
+  // Identifiers and keywords carry an IdentifierInfo* in ptr_data, which
+  // already knows the name.
+  if (Kind == CXToken_Identifier || Kind == CXToken_Keyword) {
+    IdentifierInfo *II = static_cast<IdentifierInfo *>(CXTok.ptr_data);
+    return CIndexer::createCXString(II->getNameStart());
+  }
+
+  // Literals carry a pointer to their first character in ptr_data; copy
+  // int_data[2] (the token length) characters starting there.
+  if (Kind == CXToken_Literal) {
+    const char *LiteralStart = static_cast<const char *>(CXTok.ptr_data);
+    return CIndexer::createCXString(
+        llvm::StringRef(LiteralStart, CXTok.int_data[2]), true);
+  }
+
+  // Anything else (punctuation, comments): locate the text the hard way, by
+  // decomposing the token's source location into a file buffer and an offset.
+  ASTUnit *Unit = static_cast<ASTUnit *>(TU);
+  if (!Unit)
+    return CIndexer::createCXString("");
+
+  SourceManager &SM = Unit->getSourceManager();
+  SourceLocation TokLoc = SourceLocation::getFromRawEncoding(CXTok.int_data[1]);
+  std::pair<FileID, unsigned> Decomposed = SM.getDecomposedLoc(TokLoc);
+  std::pair<const char *,const char *> FileBuffer
+    = SM.getBufferData(Decomposed.first);
+
+  const char *TokStart = FileBuffer.first + Decomposed.second;
+  return CIndexer::createCXString(llvm::StringRef(TokStart, CXTok.int_data[2]),
+                                  true);
+}
+
+/// Returns the location stored in \p CXTok (its starting location), or the
+/// null location when \p TU is null.
+CXSourceLocation clang_getTokenLocation(CXTranslationUnit TU, CXToken CXTok) {
+  if (ASTUnit *Unit = static_cast<ASTUnit *>(TU)) {
+    SourceLocation Start
+      = SourceLocation::getFromRawEncoding(CXTok.int_data[1]);
+    return cxloc::translateSourceLocation(Unit->getASTContext(), Start);
+  }
+
+  return clang_getNullLocation();
+}
+
+/// Returns the source range associated with \p CXTok. When \p TU is null,
+/// every field of the returned range is zero.
+CXSourceRange clang_getTokenExtent(CXTranslationUnit TU, CXToken CXTok) {
+  ASTUnit *Unit = static_cast<ASTUnit *>(TU);
+  if (Unit) {
+    // The range is built from the token's starting location alone.
+    // NOTE(review): presumably this yields a range that begins and ends at
+    // that one location -- confirm against SourceRange's constructors.
+    SourceLocation Start
+      = SourceLocation::getFromRawEncoding(CXTok.int_data[1]);
+    return cxloc::translateSourceRange(Unit->getASTContext(), Start);
+  }
+
+  CXSourceRange Empty = { 0, 0, 0 };
+  return Empty;
+}
+
+/// Lexes the raw tokens found in \p Range and hands them back to the caller.
+///
+/// \param TU        the translation unit (an ASTUnit) that owns the source.
+/// \param Range     the source range to lex; both ends must lie in one file.
+/// \param Tokens    receives a malloc'd array of tokens; release it with
+///                  clang_disposeTokens().
+/// \param NumTokens receives the number of tokens stored in \p Tokens.
+///
+/// *Tokens and *NumTokens are set to zero when any argument is null, the
+/// range is invalid or spans files, no token was lexed, or the result array
+/// could not be allocated. Comments are retained and reported as tokens.
+void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
+                    CXToken **Tokens, unsigned *NumTokens) {
+  if (Tokens)
+    *Tokens = 0;
+  if (NumTokens)
+    *NumTokens = 0;
+
+  ASTUnit *CXXUnit = static_cast<ASTUnit *>(TU);
+  if (!CXXUnit || !Tokens || !NumTokens)
+    return;
+
+  SourceRange R = cxloc::translateSourceRange(Range);
+  if (R.isInvalid())
+    return;
+
+  SourceManager &SourceMgr = CXXUnit->getSourceManager();
+  std::pair<FileID, unsigned> BeginLocInfo
+    = SourceMgr.getDecomposedLoc(R.getBegin());
+  std::pair<FileID, unsigned> EndLocInfo
+    = SourceMgr.getDecomposedLoc(R.getEnd());
+
+  // Cannot tokenize across files.
+  if (BeginLocInfo.first != EndLocInfo.first)
+    return;
+
+  // Create a lexer over the file's buffer, positioned at the start of the
+  // requested range, and ask it to hand back comments as tokens.
+  std::pair<const char *,const char *> Buffer
+    = SourceMgr.getBufferData(BeginLocInfo.first);
+  Lexer Lex(SourceMgr.getLocForStartOfFile(BeginLocInfo.first),
+            CXXUnit->getASTContext().getLangOptions(),
+            Buffer.first, Buffer.first + BeginLocInfo.second, Buffer.second);
+  Lex.SetCommentRetentionState(true);
+
+  // Lex tokens until we hit the end of the range.
+  const char *EffectiveBufferEnd = Buffer.first + EndLocInfo.second;
+  llvm::SmallVector<CXToken, 32> CXTokens;
+  Token Tok;
+  do {
+    // Lex the next token
+    Lex.LexFromRawLexer(Tok);
+    if (Tok.is(tok::eof))
+      break;
+
+    // Initialize the CXToken.
+    CXToken CXTok;
+
+    // - Common fields
+    CXTok.int_data[1] = Tok.getLocation().getRawEncoding();
+    CXTok.int_data[2] = Tok.getLength();
+    CXTok.int_data[3] = 0;
+
+    // - Kind-specific fields
+    if (Tok.isLiteral()) {
+      CXTok.int_data[0] = CXToken_Literal;
+      CXTok.ptr_data = (void *)Tok.getLiteralData();
+    } else if (Tok.is(tok::identifier)) {
+      // Look up the identifier to determine whether we have a keyword or a
+      // plain identifier.
+      std::pair<FileID, unsigned> LocInfo
+        = SourceMgr.getDecomposedLoc(Tok.getLocation());
+      const char *StartPos
+        = SourceMgr.getBufferData(LocInfo.first).first + LocInfo.second;
+      IdentifierInfo *II
+        = CXXUnit->getPreprocessor().LookUpIdentifierInfo(Tok, StartPos);
+      CXTok.int_data[0] = II->getTokenID() == tok::identifier?
+                               CXToken_Identifier
+                             : CXToken_Keyword;
+      CXTok.ptr_data = II;
+    } else if (Tok.is(tok::comment)) {
+      CXTok.int_data[0] = CXToken_Comment;
+      CXTok.ptr_data = 0;
+    } else {
+      CXTok.int_data[0] = CXToken_Punctuation;
+      CXTok.ptr_data = 0;
+    }
+    CXTokens.push_back(CXTok);
+  } while (Lex.getBufferLocation() <= EffectiveBufferEnd);
+
+  if (CXTokens.empty())
+    return;
+
+  // Copy the tokens into storage the caller owns. If the allocation fails,
+  // leave the outputs zeroed rather than copying through a null pointer.
+  CXToken *Result = (CXToken *)malloc(sizeof(CXToken) * CXTokens.size());
+  if (!Result)
+    return;
+
+  memmove(Result, CXTokens.data(), sizeof(CXToken) * CXTokens.size());
+  *Tokens = Result;
+  *NumTokens = CXTokens.size();
+}
+
+/// Stores one cursor per token into \p Cursors, which must have room for
+/// \p NumTokens entries. Every entry is currently the null cursor. Does
+/// nothing when \p Cursors is null.
+void clang_annotateTokens(CXTranslationUnit TU,
+                          CXToken *Tokens, unsigned NumTokens,
+                          CXCursor *Cursors) {
+  // Without an output array there is nowhere to write the results.
+  if (!Cursors)
+    return;
+
+  // FIXME: Actually perform some meaningful lookup here.
+  for (unsigned I = 0; I != NumTokens; ++I)
+    Cursors[I] = clang_getNullCursor();
+}
+
+/// Releases a token array with free(); clang_tokenize() allocates its result
+/// with malloc(). \p TU and \p NumTokens are not used.
+void clang_disposeTokens(CXTranslationUnit TU,
+                         CXToken *Tokens, unsigned NumTokens) {
+  // free() ignores a null pointer, so no explicit check is needed.
+  free(Tokens);
+}
+
+} // end: extern "C"
+
+//===----------------------------------------------------------------------===//
// CXString Operations.
//===----------------------------------------------------------------------===//
diff --git a/tools/CIndex/CIndex.exports b/tools/CIndex/CIndex.exports
index b2ec58e..fa141fc 100644
--- a/tools/CIndex/CIndex.exports
+++ b/tools/CIndex/CIndex.exports
@@ -1,3 +1,4 @@
+_clang_annotateTokens
_clang_codeComplete
_clang_createIndex
_clang_createTranslationUnit
@@ -5,6 +6,7 @@
_clang_disposeCodeCompleteResults
_clang_disposeIndex
_clang_disposeString
+_clang_disposeTokens
_clang_disposeTranslationUnit
_clang_equalCursors
_clang_equalLocations
@@ -35,6 +37,10 @@
_clang_getRange
_clang_getRangeEnd
_clang_getRangeStart
+_clang_getTokenExtent
+_clang_getTokenKind
+_clang_getTokenLocation
+_clang_getTokenSpelling
_clang_getTranslationUnitCursor
_clang_getTranslationUnitSpelling
_clang_isCursorDefinition
@@ -45,4 +51,5 @@
_clang_isStatement
_clang_isTranslationUnit
_clang_setUseExternalASTGeneration
+_clang_tokenize
_clang_visitChildren
diff --git a/tools/CIndex/CIndexer.h b/tools/CIndex/CIndexer.h
index d01454f..aa63ec0 100644
--- a/tools/CIndex/CIndexer.h
+++ b/tools/CIndex/CIndexer.h
@@ -18,6 +18,7 @@
#include "clang-c/Index.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Frontend/ASTUnit.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/System/Path.h"
#include <vector>
@@ -76,6 +77,8 @@
std::string getClangResourcesPath();
static CXString createCXString(const char *String, bool DupString = false);
+ static CXString createCXString(llvm::StringRef String,
+ bool DupString = false);
};
namespace clang {
diff --git a/tools/CIndex/CXSourceLocation.h b/tools/CIndex/CXSourceLocation.h
index 0eab273..1f15f08 100644
--- a/tools/CIndex/CXSourceLocation.h
+++ b/tools/CIndex/CXSourceLocation.h
@@ -38,8 +38,8 @@
static inline CXSourceRange translateSourceRange(ASTContext &Context,
SourceRange R) {
CXSourceRange Result = { &Context,
- R.getBegin().getRawEncoding(),
- R.getEnd().getRawEncoding() };
+ R.getBegin().getRawEncoding(),
+ R.getEnd().getRawEncoding() };
return Result;
}
diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c
index 4ef3904..222ffba 100644
--- a/tools/c-index-test/c-index-test.c
+++ b/tools/c-index-test/c-index-test.c
@@ -481,42 +481,62 @@
on failure. If successful, the pointer *filename will contain newly-allocated
memory (that will be owned by the caller) to store the file name. */
int parse_file_line_column(const char *input, char **filename, unsigned *line,
- unsigned *column) {
+ unsigned *column, unsigned *second_line,
+ unsigned *second_column) {
/* Find the second colon. */
- const char *second_colon = strrchr(input, ':'), *first_colon;
+ const char *last_colon = strrchr(input, ':');
+ unsigned values[4], i;
+ unsigned num_values = (second_line && second_column)? 4 : 2;
+
char *endptr = 0;
- if (!second_colon || second_colon == input) {
- fprintf(stderr, "could not parse filename:line:column in '%s'\n", input);
+ if (!last_colon || last_colon == input) {
+ if (num_values == 4)
+ fprintf(stderr, "could not parse filename:line:column:line:column in "
+ "'%s'\n", input);
+ else
+ fprintf(stderr, "could not parse filename:line:column in '%s'\n", input);
return 1;
}
- /* Parse the column number. */
- *column = strtol(second_colon + 1, &endptr, 10);
- if (*endptr != 0) {
- fprintf(stderr, "could not parse column in '%s'\n", input);
- return 1;
+ /* The numeric fields are consumed right to left, so an even i is a column
+ and an odd i is a line. */
+ for (i = 0; i != num_values; ++i) {
+ const char *prev_colon;
+
+ /* Parse the next line or column. */
+ values[num_values - i - 1] = strtol(last_colon + 1, &endptr, 10);
+ if (*endptr != 0 && *endptr != ':') {
+ fprintf(stderr, "could not parse %s in '%s'\n",
+ (i % 2 ? "line" : "column"), input);
+ return 1;
+ }
+
+ if (i + 1 == num_values)
+ break;
+
+ /* Find the previous colon. */
+ prev_colon = last_colon - 1;
+ while (prev_colon != input && *prev_colon != ':')
+ --prev_colon;
+ if (prev_colon == input) {
+ /* No colon introduces the field to the left of the one just parsed, so
+ report that field: a line after a column, a column after a line. */
+ fprintf(stderr, "could not parse %s in '%s'\n",
+ (i % 2 == 0? "line" : "column"), input);
+ return 1;
+ }
+
+ last_colon = prev_colon;
}
- /* Find the first colon. */
- first_colon = second_colon - 1;
- while (first_colon != input && *first_colon != ':')
- --first_colon;
- if (first_colon == input) {
- fprintf(stderr, "could not parse line in '%s'\n", input);
- return 1;
- }
-
- /* Parse the line number. */
- *line = strtol(first_colon + 1, &endptr, 10);
- if (*endptr != ':') {
- fprintf(stderr, "could not parse line in '%s'\n", input);
- return 1;
- }
+ *line = values[0];
+ *column = values[1];
+ if (second_line && second_column) {
+ *second_line = values[2];
+ *second_column = values[3];
+ }
+
/* Copy the file name. */
- *filename = (char*)malloc(first_colon - input + 1);
- memcpy(*filename, input, first_colon - input);
- (*filename)[first_colon - input] = 0;
+ *filename = (char*)malloc(last_colon - input + 1);
+ memcpy(*filename, input, last_colon - input);
+ (*filename)[last_colon - input] = 0;
return 0;
}
@@ -595,7 +615,8 @@
CXCodeCompleteResults *results = 0;
input += strlen("-code-completion-at=");
- if ((errorCode = parse_file_line_column(input, &filename, &line, &column)))
+ if ((errorCode = parse_file_line_column(input, &filename, &line, &column,
+ 0, 0)))
return errorCode;
if (parse_remapped_files(argc, argv, 2, &unsaved_files, &num_unsaved_files))
@@ -650,7 +671,7 @@
const char *input = argv[Loc + 1] + strlen("-cursor-at=");
if ((errorCode = parse_file_line_column(input, &Locations[Loc].filename,
&Locations[Loc].line,
- &Locations[Loc].column)))
+ &Locations[Loc].column, 0, 0)))
return errorCode;
}
@@ -689,6 +710,104 @@
return 0;
}
+/* Handles "-test-annotate-tokens=<file>:<line>:<column>:<line>:<column>"
+   (expected in argv[1]): parses the source file named by the last argument,
+   tokenizes the requested range, and prints the kind, spelling, and extent
+   of every token.  Returns 0 on success and a non-zero value on failure. */
+int perform_token_annotation(int argc, const char **argv) {
+  const char *input = argv[1];
+  char *filename = 0;
+  unsigned line, second_line;
+  unsigned column, second_column;
+  CXIndex CIdx;
+  CXTranslationUnit TU = 0;
+  int errorCode;
+  struct CXUnsavedFile *unsaved_files = 0;
+  int num_unsaved_files = 0;
+  /* Zero-initialized so the teardown code can release them unconditionally. */
+  CXToken *tokens = 0;
+  unsigned num_tokens = 0;
+  CXSourceRange range;
+  CXSourceLocation startLoc, endLoc;
+  CXFile file = 0;
+  CXCursor *cursors = 0;
+  unsigned i;
+
+  input += strlen("-test-annotate-tokens=");
+  if ((errorCode = parse_file_line_column(input, &filename, &line, &column,
+                                          &second_line, &second_column)))
+    return errorCode;
+
+  if (parse_remapped_files(argc, argv, 2, &unsaved_files, &num_unsaved_files)) {
+    /* parse_file_line_column allocated the file name; do not leak it. */
+    free(filename);
+    return -1;
+  }
+
+  CIdx = clang_createIndex(0, 0);
+  TU = clang_createTranslationUnitFromSourceFile(CIdx, argv[argc - 1],
+                                                 argc - num_unsaved_files - 3,
+                                                 argv + num_unsaved_files + 2,
+                                                 num_unsaved_files,
+                                                 unsaved_files);
+  if (!TU) {
+    fprintf(stderr, "unable to parse input\n");
+    clang_disposeIndex(CIdx);
+    free(filename);
+    free_remapped_files(unsaved_files, num_unsaved_files);
+    return -1;
+  }
+  errorCode = 0;
+
+  file = clang_getFile(TU, filename);
+  if (!file) {
+    fprintf(stderr, "file %s is not in this translation unit\n", filename);
+    errorCode = -1;
+    goto teardown;
+  }
+
+  startLoc = clang_getLocation(TU, file, line, column);
+  if (clang_equalLocations(clang_getNullLocation(), startLoc)) {
+    fprintf(stderr, "invalid source location %s:%u:%u\n", filename, line,
+            column);
+    errorCode = -1;
+    goto teardown;
+  }
+
+  endLoc = clang_getLocation(TU, file, second_line, second_column);
+  if (clang_equalLocations(clang_getNullLocation(), endLoc)) {
+    fprintf(stderr, "invalid source location %s:%u:%u\n", filename,
+            second_line, second_column);
+    errorCode = -1;
+    goto teardown;
+  }
+
+  range = clang_getRange(startLoc, endLoc);
+  clang_tokenize(TU, range, &tokens, &num_tokens);
+  cursors = (CXCursor *)malloc(num_tokens * sizeof(CXCursor));
+  if (num_tokens && !cursors) {
+    fprintf(stderr, "out of memory\n");
+    errorCode = -1;
+    goto teardown;
+  }
+  clang_annotateTokens(TU, tokens, num_tokens, cursors);
+  for (i = 0; i != num_tokens; ++i) {
+    const char *kind = "<unknown>";
+    CXString spelling = clang_getTokenSpelling(TU, tokens[i]);
+    CXSourceRange extent = clang_getTokenExtent(TU, tokens[i]);
+    unsigned start_line, start_column, end_line, end_column;
+
+    switch (clang_getTokenKind(tokens[i])) {
+    case CXToken_Punctuation: kind = "Punctuation"; break;
+    case CXToken_Keyword: kind = "Keyword"; break;
+    case CXToken_Identifier: kind = "Identifier"; break;
+    case CXToken_Literal: kind = "Literal"; break;
+    case CXToken_Comment: kind = "Comment"; break;
+    }
+    clang_getInstantiationLocation(clang_getRangeStart(extent),
+                                   0, &start_line, &start_column);
+    clang_getInstantiationLocation(clang_getRangeEnd(extent),
+                                   0, &end_line, &end_column);
+    printf("%s: \"%s\" [%u:%u - %u:%u]\n", kind, clang_getCString(spelling),
+           start_line, start_column, end_line, end_column);
+    /* The spelling may own heap storage; release it once printed. */
+    clang_disposeString(spelling);
+  }
+
+ teardown:
+  /* Release the per-run results before the translation unit they refer to. */
+  free(cursors);
+  clang_disposeTokens(TU, tokens, num_tokens);
+  clang_disposeTranslationUnit(TU);
+  clang_disposeIndex(CIdx);
+  free(filename);
+  free_remapped_files(unsaved_files, num_unsaved_files);
+  return errorCode;
+}
+
/******************************************************************************/
/* Command line processing. */
/******************************************************************************/
@@ -712,8 +831,9 @@
" c-index-test -test-load-tu-usrs <AST file> <symbol filter> "
"[FileCheck prefix]\n"
" c-index-test -test-load-source <symbol filter> {<args>}*\n"
- " c-index-test -test-load-source-usrs <symbol filter> {<args>}*\n\n");
+ " c-index-test -test-load-source-usrs <symbol filter> {<args>}*\n");
fprintf(stderr,
+ " c-index-test -test-annotate-tokens=<range> {<args>}* \n\n"
" <symbol filter> values:\n%s",
" all - load all symbols, including those from PCH\n"
" local - load all symbols except those in PCH\n"
@@ -743,7 +863,8 @@
else if (argc >= 4 && strcmp(argv[1], "-test-file-scan") == 0)
return perform_file_scan(argv[2], argv[3],
argc >= 5 ? argv[4] : 0);
-
+ else if (argc > 2 && strstr(argv[1], "-test-annotate-tokens=") == argv[1])
+ return perform_token_annotation(argc, argv);
print_usage();
return 1;
}