Implement clang_annotateTokens(), which associates cursors with each
of the tokens within a raw token stream. This does not even attempt to
handle macros yet.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@94561 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h
index ff0a0e1..b041dcf 100644
--- a/include/clang-c/Index.h
+++ b/include/clang-c/Index.h
@@ -861,7 +861,11 @@
  */
 
 /**
- * \defgroup CINDEX_LEX Lexing and syntactic analysis
+ * \defgroup CINDEX_LEX Token extraction and manipulation
+ *
+ * The routines in this group provide access to the tokens within a
+ * translation unit, along with a semantic mapping of those tokens to
+ * their corresponding cursors.
  *
  * @{
  */
@@ -876,7 +880,7 @@
   CXToken_Punctuation,
   
   /**
-   * \brief A a language keyword.
+   * \brief A language keyword.
    */
   CXToken_Keyword,
   
@@ -952,9 +956,22 @@
  * \brief Annotate the given set of tokens by providing cursors for each token
  * that can be mapped to a specific entity within the abstract syntax tree.
  *
- * This token-annotation routine is equivalent to invoking clang_getCursor() 
- * for the source locations of each of the tokens, then accepting only those
- * cursors that refer to a specific token.
+ * This token-annotation routine is equivalent to invoking
+ * clang_getCursor() for the source locations of each of the
+ * tokens. The cursors provided are filtered, so that only those
+ * cursors that have a direct correspondence to the token are
+ * accepted. For example, given a function call \c f(x),
+ * clang_getCursor() would provide the following cursors:
+ *
+ *   * when the cursor is over the 'f', a DeclRefExpr cursor referring to 'f'.
+ *   * when the cursor is over the '(' or the ')', a CallExpr referring to 'f'.
+ *   * when the cursor is over the 'x', a DeclRefExpr cursor referring to 'x'.
+ *
+ * Only the first and last of these cursors will occur within the
+ * annotation, since the tokens "f" and "x" directly refer to a function
+ * and a variable, respectively, but the parentheses are just a small
+ * part of the full syntax of the function call expression, which is
+ * not provided as an annotation.
  *
  * \param TU the translation unit that owns the given tokens.
  *
diff --git a/test/Index/annotate-tokens.c b/test/Index/annotate-tokens.c
index 6d2b4d2..b1af2ff 100644
--- a/test/Index/annotate-tokens.c
+++ b/test/Index/annotate-tokens.c
@@ -10,32 +10,32 @@
 }
 
 // RUN: c-index-test -test-annotate-tokens=%s:4:1:9:32 %s | FileCheck %s
-// CHECK: Identifier: "T" [4:3 - 4:3]
+// CHECK: Identifier: "T" [4:3 - 4:3] TypeRef=T:1:13
 // CHECK: Punctuation: "*" [4:4 - 4:4]
-// CHECK: Identifier: "t_ptr" [4:6 - 4:10]
+// CHECK: Identifier: "t_ptr" [4:6 - 4:10] VarDecl=t_ptr:4:6 (Definition)
 // CHECK: Punctuation: "=" [4:12 - 4:12]
 // CHECK: Punctuation: "(" [4:14 - 4:14]
-// CHECK: Identifier: "T" [4:15 - 4:15]
+// CHECK: Identifier: "T" [4:15 - 4:15] TypeRef=T:1:13
 // CHECK: Punctuation: "*" [4:17 - 4:17]
 // CHECK: Punctuation: ")" [4:18 - 4:18]
-// CHECK: Identifier: "ptr" [4:19 - 4:21]
+// CHECK: Identifier: "ptr" [4:19 - 4:21] DeclRefExpr=ptr:3:14
 // CHECK: Punctuation: ";" [4:22 - 4:22]
 // CHECK: Punctuation: "(" [5:3 - 5:3]
 // CHECK: Keyword: "void" [5:4 - 5:7]
 // CHECK: Punctuation: ")" [5:8 - 5:8]
 // CHECK: Keyword: "sizeof" [5:9 - 5:14]
 // CHECK: Punctuation: "(" [5:15 - 5:15]
-// CHECK: Identifier: "T" [5:16 - 5:16]
+// CHECK: Identifier: "T" [5:16 - 5:16] TypeRef=T:1:13
 // CHECK: Punctuation: ")" [5:17 - 5:17]
 // CHECK: Punctuation: ";" [5:18 - 5:18]
 // CHECK: Comment: "/* A comment */" [6:3 - 6:17]
 // CHECK: Keyword: "struct" [7:3 - 7:8]
-// CHECK: Identifier: "X" [7:10 - 7:10]
-// CHECK: Identifier: "x" [7:12 - 7:12]
+// CHECK: Identifier: "X" [7:10 - 7:10] TypeRef=struct X:2:8
+// CHECK: Identifier: "x" [7:12 - 7:12] VarDecl=x:7:12 (Definition)
 // CHECK: Punctuation: "=" [7:14 - 7:14]
 // CHECK: Punctuation: "(" [7:16 - 7:16]
 // CHECK: Keyword: "struct" [7:17 - 7:22]
-// CHECK: Identifier: "X" [7:24 - 7:24]
+// CHECK: Identifier: "X" [7:24 - 7:24] TypeRef=struct X:2:8
 // CHECK: Punctuation: ")" [7:25 - 7:25]
 // CHECK: Punctuation: "{" [7:26 - 7:26]
 // CHECK: Literal: "1" [7:27 - 7:27]
@@ -45,18 +45,18 @@
 // CHECK: Punctuation: ";" [7:32 - 7:32]
 // CHECK: Keyword: "void" [8:3 - 8:6]
 // CHECK: Punctuation: "*" [8:8 - 8:8]
-// CHECK: Identifier: "xx" [8:9 - 8:10]
+// CHECK: Identifier: "xx" [8:9 - 8:10] VarDecl=xx:8:9 (Definition)
 // CHECK: Punctuation: "=" [8:12 - 8:12]
-// CHECK: Identifier: "ptr" [8:14 - 8:16]
+// CHECK: Identifier: "ptr" [8:14 - 8:16] DeclRefExpr=ptr:3:14
 // CHECK: Punctuation: "?" [8:18 - 8:18]
 // CHECK: Punctuation: ":" [8:20 - 8:20]
 // CHECK: Punctuation: "&" [8:22 - 8:22]
-// CHECK: Identifier: "x" [8:23 - 8:23]
+// CHECK: Identifier: "x" [8:23 - 8:23] DeclRefExpr=x:7:12
 // CHECK: Punctuation: ";" [8:24 - 8:24]
 // CHECK: Keyword: "const" [9:3 - 9:7]
 // CHECK: Keyword: "char" [9:9 - 9:12]
 // CHECK: Punctuation: "*" [9:14 - 9:14]
-// CHECK: Identifier: "hello" [9:16 - 9:20]
+// CHECK: Identifier: "hello" [9:16 - 9:20] VarDecl=hello:9:16 (Definition)
 // CHECK: Punctuation: "=" [9:22 - 9:22]
 // CHECK: Literal: ""Hello"" [9:24 - 9:30]
 // CHECK: Punctuation: ";" [9:31 - 9:31]
diff --git a/tools/CIndex/CIndex.cpp b/tools/CIndex/CIndex.cpp
index 55061cb..3a59779 100644
--- a/tools/CIndex/CIndex.cpp
+++ b/tools/CIndex/CIndex.cpp
@@ -2056,19 +2056,86 @@
   memmove(*Tokens, CXTokens.data(), sizeof(CXToken) * CXTokens.size());
   *NumTokens = CXTokens.size();
 }
+
+typedef llvm::DenseMap<unsigned, CXCursor> AnnotateTokensData;
+
+enum CXChildVisitResult AnnotateTokensVisitor(CXCursor cursor, 
+                                              CXCursor parent, 
+                                              CXClientData client_data) {
+  AnnotateTokensData *Data = static_cast<AnnotateTokensData *>(client_data);
+
+  // We only annotate the locations of declarations, simple
+  // references, and expressions which directly reference something.
+  CXCursorKind Kind = clang_getCursorKind(cursor);
+  if (clang_isDeclaration(Kind) || clang_isReference(Kind)) {
+    // Okay: We can annotate the location of this declaration with the
+    // declaration or reference
+  } else if (clang_isExpression(cursor.kind)) {
+    if (Kind != CXCursor_DeclRefExpr &&
+        Kind != CXCursor_MemberRefExpr &&
+        Kind != CXCursor_ObjCMessageExpr)
+      return CXChildVisit_Recurse;
+
+    CXCursor Referenced = clang_getCursorReferenced(cursor);
+    if (Referenced == cursor || Referenced == clang_getNullCursor())
+      return CXChildVisit_Recurse;
+    
+    // Okay: we can annotate the location of this expression
+  } else {
+    // Nothing to annotate
+    return CXChildVisit_Recurse;
+  }
   
+  CXSourceLocation Loc = clang_getCursorLocation(cursor);
+  (*Data)[Loc.int_data] = cursor;
+  return CXChildVisit_Recurse;
+}
+
 void clang_annotateTokens(CXTranslationUnit TU,
                           CXToken *Tokens, unsigned NumTokens,
                           CXCursor *Cursors) {
-  // FIXME: Actually perform some meaningful lookup here.
+  if (NumTokens == 0)
+    return;
+
+  // Any token we don't specifically annotate will have a NULL cursor.
   for (unsigned I = 0; I != NumTokens; ++I)
     Cursors[I] = clang_getNullCursor();
+
+  ASTUnit *CXXUnit = static_cast<ASTUnit *>(TU);
+  if (!CXXUnit || !Tokens)
+    return;
+
+  // Annotate every location in the region of interest that maps to a cursor.
+  SourceRange RegionOfInterest;
+  RegionOfInterest.setBegin(
+        cxloc::translateSourceLocation(clang_getTokenLocation(TU, Tokens[0])));
+  SourceLocation End
+    = cxloc::translateSourceLocation(clang_getTokenLocation(TU, 
+                                                     Tokens[NumTokens - 1]));
+  RegionOfInterest.setEnd(CXXUnit->getPreprocessor().getLocForEndOfToken(End, 
+                                                                         1));
+  // FIXME: Would be great to have a "hint" cursor, then walk from that
+  // hint cursor upward until we find a cursor whose source range encloses
+  // the region of interest, rather than starting from the translation unit.
+  AnnotateTokensData Annotated;
+  CXCursor Parent = clang_getTranslationUnitCursor(CXXUnit);
+  CursorVisitor AnnotateVis(CXXUnit, AnnotateTokensVisitor, &Annotated, 
+                            Decl::MaxPCHLevel, RegionOfInterest);
+  AnnotateVis.VisitChildren(Parent);
+
+  for (unsigned I = 0; I != NumTokens; ++I) {
+    // Determine whether we saw a cursor at this token's location.
+    AnnotateTokensData::iterator Pos = Annotated.find(Tokens[I].int_data[1]);
+    if (Pos == Annotated.end())
+      continue;
+
+    Cursors[I] = Pos->second;
+  }
 }
 
 void clang_disposeTokens(CXTranslationUnit TU, 
                          CXToken *Tokens, unsigned NumTokens) {
-  if (Tokens)
-    free(Tokens);
+  free(Tokens);
 }
   
 } // end: extern "C"
diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c
index 1015aa5..243b873 100644
--- a/tools/c-index-test/c-index-test.c
+++ b/tools/c-index-test/c-index-test.c
@@ -810,8 +810,13 @@
                                    0, &start_line, &start_column);
     clang_getInstantiationLocation(clang_getRangeEnd(extent),
                                    0, &end_line, &end_column);
-    printf("%s: \"%s\" [%d:%d - %d:%d]\n", kind, clang_getCString(spelling),
+    printf("%s: \"%s\" [%d:%d - %d:%d]", kind, clang_getCString(spelling),
            start_line, start_column, end_line, end_column);
+    if (!clang_isInvalid(cursors[i].kind)) {
+      printf(" ");
+      PrintCursor(cursors[i]);
+    }
+    printf("\n");
   }
   free(cursors);