Introduce a CIndex API for lexing the raw tokens within a given source
range. The token-annotation function does nothing, yet.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@94551 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/tools/CIndex/CIndex.cpp b/tools/CIndex/CIndex.cpp
index 03519ad..55061cb 100644
--- a/tools/CIndex/CIndex.cpp
+++ b/tools/CIndex/CIndex.cpp
@@ -876,6 +876,21 @@
   return Str;
 }
 
+CXString CIndexer::createCXString(llvm::StringRef String, bool DupString) {
+  CXString Result;
+  if (DupString || (!String.empty() && String.data()[String.size()] != 0)) {
+    char *Spelling = (char *)malloc(String.size() + 1);
+    memmove(Spelling, String.data(), String.size());
+    Spelling[String.size()] = 0;
+    Result.Spelling = Spelling;
+    Result.MustFreeString = 1;
+  } else {
+    Result.Spelling = String.data();
+    Result.MustFreeString = 0;
+  }
+  return Result;
+}
+
 extern "C" {
 CXIndex clang_createIndex(int excludeDeclarationsFromPCH,
                           int displayDiagnostics) {
@@ -1882,6 +1897,183 @@
 } // end: extern "C"
 
 //===----------------------------------------------------------------------===//
+// Token-based Operations.
+//===----------------------------------------------------------------------===//
+
+/* CXToken layout:
+ *   int_data[0]: a CXTokenKind
+ *   int_data[1]: starting token location
+ *   int_data[2]: token length
+ *   int_data[3]: reserved
+ *   ptr_data: for identifiers and keywords, an IdentifierInfo*. 
+ *   otherwise unused.
+ */
+extern "C" {
+
+CXTokenKind clang_getTokenKind(CXToken CXTok) {
+  return static_cast<CXTokenKind>(CXTok.int_data[0]);
+}
+
+CXString clang_getTokenSpelling(CXTranslationUnit TU, CXToken CXTok) {
+  switch (clang_getTokenKind(CXTok)) {
+  case CXToken_Identifier:
+  case CXToken_Keyword:
+    // We know we have an IdentifierInfo*, so use that.
+    return CIndexer::createCXString(
+              static_cast<IdentifierInfo *>(CXTok.ptr_data)->getNameStart());
+
+  case CXToken_Literal: {
+    // We have stashed the starting pointer in the ptr_data field. Use it.
+    const char *Text = static_cast<const char *>(CXTok.ptr_data);
+    return CIndexer::createCXString(llvm::StringRef(Text, CXTok.int_data[2]), 
+                                    true);
+  }
+      
+  case CXToken_Punctuation:
+  case CXToken_Comment:
+    break;
+  }
+  
+  // We have to find the starting buffer pointer the hard way, by 
+  // deconstructing the source location.
+  ASTUnit *CXXUnit = static_cast<ASTUnit *>(TU);
+  if (!CXXUnit)
+    return CIndexer::createCXString("");
+  
+  SourceLocation Loc = SourceLocation::getFromRawEncoding(CXTok.int_data[1]);
+  std::pair<FileID, unsigned> LocInfo
+    = CXXUnit->getSourceManager().getDecomposedLoc(Loc);
+  std::pair<const char *,const char *> Buffer
+    = CXXUnit->getSourceManager().getBufferData(LocInfo.first);
+
+  return CIndexer::createCXString(llvm::StringRef(Buffer.first+LocInfo.second,
+                                                  CXTok.int_data[2]), 
+                                  true);
+}
+ 
+CXSourceLocation clang_getTokenLocation(CXTranslationUnit TU, CXToken CXTok) {
+  ASTUnit *CXXUnit = static_cast<ASTUnit *>(TU);
+  if (!CXXUnit)
+    return clang_getNullLocation();
+  
+  return cxloc::translateSourceLocation(CXXUnit->getASTContext(),
+                        SourceLocation::getFromRawEncoding(CXTok.int_data[1]));
+}
+
+CXSourceRange clang_getTokenExtent(CXTranslationUnit TU, CXToken CXTok) {
+  ASTUnit *CXXUnit = static_cast<ASTUnit *>(TU);
+  if (!CXXUnit) {
+    CXSourceRange Result = { 0, 0, 0 };
+    return Result;
+  }
+  
+  return cxloc::translateSourceRange(CXXUnit->getASTContext(), 
+                        SourceLocation::getFromRawEncoding(CXTok.int_data[1]));
+}
+  
+void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
+                    CXToken **Tokens, unsigned *NumTokens) {
+  if (Tokens)
+    *Tokens = 0;
+  if (NumTokens)
+    *NumTokens = 0;
+  
+  ASTUnit *CXXUnit = static_cast<ASTUnit *>(TU);
+  if (!CXXUnit || !Tokens || !NumTokens)
+    return;
+  
+  SourceRange R = cxloc::translateSourceRange(Range);
+  if (R.isInvalid())
+    return;
+  
+  SourceManager &SourceMgr = CXXUnit->getSourceManager();
+  std::pair<FileID, unsigned> BeginLocInfo
+    = SourceMgr.getDecomposedLoc(R.getBegin());
+  std::pair<FileID, unsigned> EndLocInfo
+    = SourceMgr.getDecomposedLoc(R.getEnd());
+  
+  // Cannot tokenize across files.
+  if (BeginLocInfo.first != EndLocInfo.first)
+    return;
+  
+  // Create a lexer 
+  std::pair<const char *,const char *> Buffer
+    = SourceMgr.getBufferData(BeginLocInfo.first);
+  Lexer Lex(SourceMgr.getLocForStartOfFile(BeginLocInfo.first),
+            CXXUnit->getASTContext().getLangOptions(),
+            Buffer.first, Buffer.first + BeginLocInfo.second, Buffer.second);
+  Lex.SetCommentRetentionState(true);
+  
+  // Lex tokens until we hit the end of the range.
+  const char *EffectiveBufferEnd = Buffer.first + EndLocInfo.second;
+  llvm::SmallVector<CXToken, 32> CXTokens;
+  Token Tok;
+  do {
+    // Lex the next token
+    Lex.LexFromRawLexer(Tok);
+    if (Tok.is(tok::eof))
+      break;
+    
+    // Initialize the CXToken.
+    CXToken CXTok;
+    
+    //   - Common fields
+    CXTok.int_data[1] = Tok.getLocation().getRawEncoding();
+    CXTok.int_data[2] = Tok.getLength();
+    CXTok.int_data[3] = 0;
+    
+    //   - Kind-specific fields
+    if (Tok.isLiteral()) {
+      CXTok.int_data[0] = CXToken_Literal;
+      CXTok.ptr_data = (void *)Tok.getLiteralData();
+    } else if (Tok.is(tok::identifier)) {
+      // Lookup the identifier to determine whether we have a 
+      std::pair<FileID, unsigned> LocInfo
+        = SourceMgr.getDecomposedLoc(Tok.getLocation());
+      const char *StartPos 
+        = CXXUnit->getSourceManager().getBufferData(LocInfo.first).first + 
+          LocInfo.second;
+      IdentifierInfo *II
+        = CXXUnit->getPreprocessor().LookUpIdentifierInfo(Tok, StartPos);
+      CXTok.int_data[0] = II->getTokenID() == tok::identifier?
+                               CXToken_Identifier
+                             : CXToken_Keyword;
+      CXTok.ptr_data = II;
+    } else if (Tok.is(tok::comment)) {
+      CXTok.int_data[0] = CXToken_Comment;
+      CXTok.ptr_data = 0;
+    } else {
+      CXTok.int_data[0] = CXToken_Punctuation;
+      CXTok.ptr_data = 0;
+    }
+    CXTokens.push_back(CXTok);
+  } while (Lex.getBufferLocation() <= EffectiveBufferEnd);
+  
+  if (CXTokens.empty())
+    return;
+  
+  *Tokens = (CXToken *)malloc(sizeof(CXToken) * CXTokens.size());
+  memmove(*Tokens, CXTokens.data(), sizeof(CXToken) * CXTokens.size());
+  *NumTokens = CXTokens.size();
+}
+  
+void clang_annotateTokens(CXTranslationUnit TU,
+                          CXToken *Tokens, unsigned NumTokens,
+                          CXCursor *Cursors) {
+  // FIXME: Actually perform some meaningful lookup here.
+  for (unsigned I = 0; I != NumTokens; ++I)
+    Cursors[I] = clang_getNullCursor();
+}
+
+void clang_disposeTokens(CXTranslationUnit TU, 
+                         CXToken *Tokens, unsigned NumTokens) {
+  if (Tokens)
+    free(Tokens);
+}
+  
+} // end: extern "C"
+
+//===----------------------------------------------------------------------===//
 // CXString Operations.
 //===----------------------------------------------------------------------===//