blob: a1deb3d256dd485e6c03e6480903843a90e54c5e [file] [log] [blame]
//===--- SemaChecking.cpp - Extra Semantic Checking -----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Ted Kremenek and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements extra semantic analysis beyond what is enforced
//  by the C type system.
//
//===----------------------------------------------------------------------===//
14
15#include "Sema.h"
16#include "clang/AST/ASTContext.h"
17#include "clang/AST/Decl.h"
18#include "clang/AST/Expr.h"
Ted Kremenek23245122007-08-20 16:18:38 +000019#include "clang/AST/ExprCXX.h"
Chris Lattner59907c42007-08-10 20:18:51 +000020#include "clang/Lex/Preprocessor.h"
21#include "clang/Lex/LiteralSupport.h"
22#include "clang/Basic/SourceManager.h"
23#include "clang/Basic/Diagnostic.h"
24#include "clang/Basic/LangOptions.h"
25#include "clang/Basic/TargetInfo.h"
26#include "llvm/ADT/SmallString.h"
27#include "llvm/ADT/StringExtras.h"
28using namespace clang;
29
30/// CheckFunctionCall - Check a direct function call for various correctness
31/// and safety properties not strictly enforced by the C type system.
Anders Carlsson71993dd2007-08-17 05:31:46 +000032bool
Ted Kremenek71895b92007-08-14 17:39:48 +000033Sema::CheckFunctionCall(Expr *Fn,
34 SourceLocation LParenLoc, SourceLocation RParenLoc,
35 FunctionDecl *FDecl,
Chris Lattner59907c42007-08-10 20:18:51 +000036 Expr** Args, unsigned NumArgsInCall) {
37
38 // Get the IdentifierInfo* for the called function.
39 IdentifierInfo *FnInfo = FDecl->getIdentifier();
40
Anders Carlsson71993dd2007-08-17 05:31:46 +000041 if (FnInfo->getBuiltinID() ==
42 Builtin::BI__builtin___CFStringMakeConstantString) {
43 assert(NumArgsInCall == 1 &&
44 "Wrong number of arguments to builtin CFStringMakeConstantString");
45 return CheckBuiltinCFStringArgument(Args[0]);
46 }
47
Chris Lattner59907c42007-08-10 20:18:51 +000048 // Search the KnownFunctionIDs for the identifier.
49 unsigned i = 0, e = id_num_known_functions;
Ted Kremenek71895b92007-08-14 17:39:48 +000050 for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000051 if (i == e) return false;
Chris Lattner59907c42007-08-10 20:18:51 +000052
53 // Printf checking.
54 if (i <= id_vprintf) {
Ted Kremenek71895b92007-08-14 17:39:48 +000055 // Retrieve the index of the format string parameter and determine
56 // if the function is passed a va_arg argument.
Chris Lattner59907c42007-08-10 20:18:51 +000057 unsigned format_idx = 0;
Ted Kremenek71895b92007-08-14 17:39:48 +000058 bool HasVAListArg = false;
59
Chris Lattner59907c42007-08-10 20:18:51 +000060 switch (i) {
61 default: assert(false && "No format string argument index.");
62 case id_printf: format_idx = 0; break;
63 case id_fprintf: format_idx = 1; break;
64 case id_sprintf: format_idx = 1; break;
65 case id_snprintf: format_idx = 2; break;
Ted Kremenek71895b92007-08-14 17:39:48 +000066 case id_asprintf: format_idx = 1; HasVAListArg = true; break;
67 case id_vsnprintf: format_idx = 2; HasVAListArg = true; break;
68 case id_vasprintf: format_idx = 1; HasVAListArg = true; break;
69 case id_vfprintf: format_idx = 1; HasVAListArg = true; break;
70 case id_vsprintf: format_idx = 1; HasVAListArg = true; break;
71 case id_vprintf: format_idx = 0; HasVAListArg = true; break;
72 }
73
74 CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg,
Ted Kremenek06de2762007-08-17 16:46:58 +000075 FDecl, format_idx, Args, NumArgsInCall);
Chris Lattner59907c42007-08-10 20:18:51 +000076 }
Anders Carlsson71993dd2007-08-17 05:31:46 +000077
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000078 return false;
Anders Carlsson71993dd2007-08-17 05:31:46 +000079}
80
81/// CheckBuiltinCFStringArgument - Checks that the argument to the builtin
82/// CFString constructor is correct
83bool Sema::CheckBuiltinCFStringArgument(Expr* Arg)
84{
85 while (ParenExpr *PE = dyn_cast<ParenExpr>(Arg))
86 Arg = PE->getSubExpr();
87
88 StringLiteral *Literal = dyn_cast<StringLiteral>(Arg);
89
90 if (!Literal || Literal->isWide()) {
91 Diag(Arg->getLocStart(),
92 diag::err_cfstring_literal_not_string_constant,
93 Arg->getSourceRange());
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000094 return true;
Anders Carlsson71993dd2007-08-17 05:31:46 +000095 }
96
97 const char *Data = Literal->getStrData();
98 unsigned Length = Literal->getByteLength();
99
100 for (unsigned i = 0; i < Length; ++i) {
101 if (!isascii(Data[i])) {
102 Diag(PP.AdvanceToTokenCharacter(Arg->getLocStart(), i + 1),
103 diag::warn_cfstring_literal_contains_non_ascii_character,
104 Arg->getSourceRange());
105 break;
106 }
107
108 if (!Data[i]) {
109 Diag(PP.AdvanceToTokenCharacter(Arg->getLocStart(), i + 1),
110 diag::warn_cfstring_literal_contains_nul_character,
111 Arg->getSourceRange());
112 break;
113 }
114 }
115
Anders Carlsson9cdc4d32007-08-17 15:44:17 +0000116 return false;
Chris Lattner59907c42007-08-10 20:18:51 +0000117}
118
119/// CheckPrintfArguments - Check calls to printf (and similar functions) for
Ted Kremenek71895b92007-08-14 17:39:48 +0000120/// correct use of format strings.
121///
122/// HasVAListArg - A predicate indicating whether the printf-like
123/// function is passed an explicit va_arg argument (e.g., vprintf)
124///
125/// format_idx - The index into Args for the format string.
126///
127/// Improper format strings to functions in the printf family can be
128/// the source of bizarre bugs and very serious security holes. A
129/// good source of information is available in the following paper
130/// (which includes additional references):
Chris Lattner59907c42007-08-10 20:18:51 +0000131///
132/// FormatGuard: Automatic Protection From printf Format String
133/// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001.
Ted Kremenek71895b92007-08-14 17:39:48 +0000134///
135/// Functionality implemented:
136///
137/// We can statically check the following properties for string
138/// literal format strings for non v.*printf functions (where the
139/// arguments are passed directly):
140//
141/// (1) Are the number of format conversions equal to the number of
142/// data arguments?
143///
144/// (2) Does each format conversion correctly match the type of the
145/// corresponding data argument? (TODO)
146///
147/// Moreover, for all printf functions we can:
148///
149/// (3) Check for a missing format string (when not caught by type checking).
150///
151/// (4) Check for no-operation flags; e.g. using "#" with format
152/// conversion 'c' (TODO)
153///
154/// (5) Check the use of '%n', a major source of security holes.
155///
156/// (6) Check for malformed format conversions that don't specify anything.
157///
158/// (7) Check for empty format strings. e.g: printf("");
159///
160/// (8) Check that the format string is a wide literal.
161///
162/// All of these checks can be done by parsing the format string.
163///
164/// For now, we ONLY do (1), (3), (5), (6), (7), and (8).
Chris Lattner59907c42007-08-10 20:18:51 +0000165void
Ted Kremenek71895b92007-08-14 17:39:48 +0000166Sema::CheckPrintfArguments(Expr *Fn,
167 SourceLocation LParenLoc, SourceLocation RParenLoc,
168 bool HasVAListArg, FunctionDecl *FDecl,
Ted Kremenek82077102007-08-10 21:21:05 +0000169 unsigned format_idx, Expr** Args,
170 unsigned NumArgsInCall) {
Ted Kremenek71895b92007-08-14 17:39:48 +0000171 // CHECK: printf-like function is called with no format string.
172 if (format_idx >= NumArgsInCall) {
173 Diag(RParenLoc, diag::warn_printf_missing_format_string,
174 Fn->getSourceRange());
175 return;
176 }
177
Chris Lattner59907c42007-08-10 20:18:51 +0000178 // CHECK: format string is not a string literal.
179 //
Ted Kremenek71895b92007-08-14 17:39:48 +0000180 // Dynamically generated format strings are difficult to
181 // automatically vet at compile time. Requiring that format strings
182 // are string literals: (1) permits the checking of format strings by
183 // the compiler and thereby (2) can practically remove the source of
184 // many format string exploits.
Chris Lattner59907c42007-08-10 20:18:51 +0000185 StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]);
186
Ted Kremenek71895b92007-08-14 17:39:48 +0000187 if (FExpr == NULL) {
188 Diag(Args[format_idx]->getLocStart(),
189 diag::warn_printf_not_string_constant, Fn->getSourceRange());
190 return;
191 }
192
193 // CHECK: is the format string a wide literal?
194 if (FExpr->isWide()) {
195 Diag(Args[format_idx]->getLocStart(),
196 diag::warn_printf_format_string_is_wide_literal,
197 Fn->getSourceRange());
198 return;
199 }
200
201 // Str - The format string. NOTE: this is NOT null-terminated!
202 const char * const Str = FExpr->getStrData();
203
204 // CHECK: empty format string?
205 const unsigned StrLen = FExpr->getByteLength();
206
207 if (StrLen == 0) {
208 Diag(Args[format_idx]->getLocStart(),
209 diag::warn_printf_empty_format_string, Fn->getSourceRange());
210 return;
211 }
212
213 // We process the format string using a binary state machine. The
214 // current state is stored in CurrentState.
215 enum {
216 state_OrdChr,
217 state_Conversion
218 } CurrentState = state_OrdChr;
219
220 // numConversions - The number of conversions seen so far. This is
221 // incremented as we traverse the format string.
222 unsigned numConversions = 0;
223
224 // numDataArgs - The number of data arguments after the format
225 // string. This can only be determined for non vprintf-like
226 // functions. For those functions, this value is 1 (the sole
227 // va_arg argument).
228 unsigned numDataArgs = NumArgsInCall-(format_idx+1);
229
230 // Inspect the format string.
231 unsigned StrIdx = 0;
232
233 // LastConversionIdx - Index within the format string where we last saw
234 // a '%' character that starts a new format conversion.
235 unsigned LastConversionIdx = 0;
236
237 for ( ; StrIdx < StrLen ; ++StrIdx ) {
238
239 // Is the number of detected conversion conversions greater than
240 // the number of matching data arguments? If so, stop.
241 if (!HasVAListArg && numConversions > numDataArgs) break;
242
243 // Handle "\0"
244 if(Str[StrIdx] == '\0' ) {
245 // The string returned by getStrData() is not null-terminated,
246 // so the presence of a null character is likely an error.
247
248 SourceLocation Loc =
249 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1);
250
251 Diag(Loc, diag::warn_printf_format_string_contains_null_char,
252 Fn->getSourceRange());
253
254 return;
255 }
256
257 // Ordinary characters (not processing a format conversion).
258 if (CurrentState == state_OrdChr) {
259 if (Str[StrIdx] == '%') {
260 CurrentState = state_Conversion;
261 LastConversionIdx = StrIdx;
262 }
263 continue;
264 }
265
266 // Seen '%'. Now processing a format conversion.
267 switch (Str[StrIdx]) {
268 // Characters which can terminate a format conversion
269 // (e.g. "%d"). Characters that specify length modifiers or
270 // other flags are handled by the default case below.
271 //
272 // TODO: additional checks will go into the following cases.
273 case 'i':
274 case 'd':
275 case 'o':
276 case 'u':
277 case 'x':
278 case 'X':
279 case 'D':
280 case 'O':
281 case 'U':
282 case 'e':
283 case 'E':
284 case 'f':
285 case 'F':
286 case 'g':
287 case 'G':
288 case 'a':
289 case 'A':
290 case 'c':
291 case 'C':
292 case 'S':
293 case 's':
294 case 'P':
295 ++numConversions;
296 CurrentState = state_OrdChr;
297 break;
298
299 // CHECK: Are we using "%n"? Issue a warning.
300 case 'n': {
301 ++numConversions;
302 CurrentState = state_OrdChr;
303 SourceLocation Loc =
304 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
305 LastConversionIdx+1);
306
307 Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange());
308 break;
309 }
310
311 // Handle "%%"
312 case '%':
313 // Sanity check: Was the first "%" character the previous one?
314 // If not, we will assume that we have a malformed format
315 // conversion, and that the current "%" character is the start
316 // of a new conversion.
317 if (StrIdx - LastConversionIdx == 1)
318 CurrentState = state_OrdChr;
319 else {
320 // Issue a warning: invalid format conversion.
321 SourceLocation Loc =
322 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
323 LastConversionIdx+1);
324
325 Diag(Loc, diag::warn_printf_invalid_conversion,
326 std::string(Str+LastConversionIdx, Str+StrIdx),
327 Fn->getSourceRange());
328
329 // This conversion is broken. Advance to the next format
330 // conversion.
331 LastConversionIdx = StrIdx;
332 ++numConversions;
333 }
334
335 break;
336
337 default:
338 // This case catches all other characters: flags, widths, etc.
339 // We should eventually process those as well.
340 break;
341 }
342 }
343
344 if (CurrentState == state_Conversion) {
345 // Issue a warning: invalid format conversion.
346 SourceLocation Loc =
347 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
348 LastConversionIdx+1);
349
350 Diag(Loc, diag::warn_printf_invalid_conversion,
Ted Kremenek06de2762007-08-17 16:46:58 +0000351 std::string(Str+LastConversionIdx, Str+StrIdx),
Ted Kremenek71895b92007-08-14 17:39:48 +0000352 Fn->getSourceRange());
353 return;
354 }
355
356 if (!HasVAListArg) {
357 // CHECK: Does the number of format conversions exceed the number
358 // of data arguments?
359 if (numConversions > numDataArgs) {
360 SourceLocation Loc =
361 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
362 LastConversionIdx);
363
364 Diag(Loc, diag::warn_printf_insufficient_data_args,
365 Fn->getSourceRange());
366 }
367 // CHECK: Does the number of data arguments exceed the number of
368 // format conversions in the format string?
369 else if (numConversions < numDataArgs)
370 Diag(Args[format_idx+numConversions+1]->getLocStart(),
371 diag::warn_printf_too_many_data_args, Fn->getSourceRange());
372 }
373}
Ted Kremenek06de2762007-08-17 16:46:58 +0000374
375//===--- CHECK: Return Address of Stack Variable --------------------------===//
376
377static DeclRefExpr* EvalVal(Expr *E);
378static DeclRefExpr* EvalAddr(Expr* E);
379
380/// CheckReturnStackAddr - Check if a return statement returns the address
381/// of a stack variable.
382void
383Sema::CheckReturnStackAddr(Expr *RetValExp, QualType lhsType,
384 SourceLocation ReturnLoc) {
385
386 // Perform checking for returned stack addresses.
387 if (lhsType->isPointerType()) {
388 if (DeclRefExpr *DR = EvalAddr(RetValExp))
389 Diag(DR->getLocStart(), diag::warn_ret_stack_addr,
390 DR->getDecl()->getIdentifier()->getName(),
391 RetValExp->getSourceRange());
392 }
393 // Perform checking for stack values returned by reference.
394 else if (lhsType->isReferenceType()) {
395 if (DeclRefExpr *DR = EvalVal(RetValExp))
396 Diag(DR->getLocStart(), diag::warn_ret_stack_ref,
397 DR->getDecl()->getIdentifier()->getName(),
398 RetValExp->getSourceRange());
399 }
400}
401
402/// EvalAddr - EvalAddr and EvalVal are mutually recursive functions that
403/// check if the expression in a return statement evaluates to an address
404/// to a location on the stack. The recursion is used to traverse the
405/// AST of the return expression, with recursion backtracking when we
406/// encounter a subexpression that (1) clearly does not lead to the address
407/// of a stack variable or (2) is something we cannot determine leads to
408/// the address of a stack variable based on such local checking.
409///
410/// EvalAddr processes expressions that are pointers, and EvalVal handles
411/// expressions that are rvalues or variable references.
412/// At the base case of the recursion is a check for a DeclRefExpr* in
413/// the refers to a stack variable.
414///
415/// This implementation handles:
416///
417/// * pointer-to-pointer casts
418/// * implicit conversions from array references to pointers
419/// * taking the address of fields
420/// * arbitrary interplay between "&" and "*" operators
421/// * pointer arithmetic from an address of a stack variable
422/// * taking the address of an array element where the array is on the stack
423static DeclRefExpr* EvalAddr(Expr *E) {
424
425 // We should only be called for evaluating pointer expressions.
426 assert (E->getType()->isPointerType() && "EvalAddr only works on pointers");
427
428 // Our "symbolic interpreter" is just a dispatch off the currently
429 // viewed AST node. We then recursively traverse the AST by calling
430 // EvalAddr and EvalVal appropriately.
431 switch (E->getStmtClass()) {
432
433 case Stmt::ParenExprClass:
434 // Ignore parentheses.
435 return EvalAddr(cast<ParenExpr>(E)->getSubExpr());
436
437 case Stmt::UnaryOperatorClass: {
438 // The only unary operator that make sense to handle here
439 // is AddrOf. All others don't make sense as pointers.
440 UnaryOperator *U = cast<UnaryOperator>(E);
441
442 if (U->getOpcode() == UnaryOperator::AddrOf)
443 return EvalVal(U->getSubExpr());
444 else
445 return NULL;
446 }
447
448 case Stmt::BinaryOperatorClass: {
449 // Handle pointer arithmetic. All other binary operators are not valid
450 // in this context.
451 BinaryOperator *B = cast<BinaryOperator>(E);
452 BinaryOperator::Opcode op = B->getOpcode();
453
454 if (op != BinaryOperator::Add && op != BinaryOperator::Sub)
455 return NULL;
456
457 Expr *Base = B->getLHS();
458
459 // Determine which argument is the real pointer base. It could be
460 // the RHS argument instead of the LHS.
461 if (!Base->getType()->isPointerType()) Base = B->getRHS();
462
463 assert (Base->getType()->isPointerType());
464 return EvalAddr(Base);
465 }
466
467 // For conditional operators we need to see if either the LHS or RHS are
468 // valid DeclRefExpr*s. If one of them is valid, we return it.
469 case Stmt::ConditionalOperatorClass: {
470 ConditionalOperator *C = cast<ConditionalOperator>(E);
471
472 if (DeclRefExpr* LHS = EvalAddr(C->getLHS()))
473 return LHS;
474 else
475 return EvalAddr(C->getRHS());
476 }
477
478 // For implicit casts, we need to handle conversions from arrays to
479 // pointer values, and implicit pointer-to-pointer conversions.
480 case Stmt::ImplicitCastExprClass: {
481 ImplicitCastExpr *IE = cast<ImplicitCastExpr>(E);
482 Expr* SubExpr = IE->getSubExpr();
483
484 if (SubExpr->getType()->isPointerType())
485 return EvalAddr(SubExpr);
486 else
487 return EvalVal(SubExpr);
488 }
489
490 // For casts, we handle pointer-to-pointer conversions (which
491 // is essentially a no-op from our mini-interpreter's standpoint).
492 // For other casts we abort.
493 case Stmt::CastExprClass: {
494 CastExpr *C = cast<CastExpr>(E);
495 Expr *SubExpr = C->getSubExpr();
496
497 if (SubExpr->getType()->isPointerType())
498 return EvalAddr(SubExpr);
499 else
500 return NULL;
501 }
502
Ted Kremenek23245122007-08-20 16:18:38 +0000503 // C++ casts. For dynamic casts, static casts, and const casts, we
504 // are always converting from a pointer-to-pointer, so we just blow
505 // through the cast. In the case the dynamic cast doesn't fail
506 // (and return NULL), we take the conservative route and report cases
507 // where we return the address of a stack variable. For Reinterpre
508 case Stmt::CXXCastExprClass: {
509 CXXCastExpr *C = cast<CXXCastExpr>(E);
510
511 if (C->getOpcode() == CXXCastExpr::ReinterpretCast) {
512 Expr *S = C->getSubExpr();
513 if (S->getType()->isPointerType())
514 return EvalAddr(S);
515 else
516 return NULL;
517 }
518 else
519 return EvalAddr(C->getSubExpr());
520 }
Ted Kremenek06de2762007-08-17 16:46:58 +0000521
522 // Everything else: we simply don't reason about them.
523 default:
524 return NULL;
525 }
526}
527
528
529/// EvalVal - This function is complements EvalAddr in the mutual recursion.
530/// See the comments for EvalAddr for more details.
531static DeclRefExpr* EvalVal(Expr *E) {
532
533 // We should only be called for evaluating non-pointer expressions.
534 assert (!E->getType()->isPointerType() && "EvalVal doesn't work on pointers");
535
536 // Our "symbolic interpreter" is just a dispatch off the currently
537 // viewed AST node. We then recursively traverse the AST by calling
538 // EvalAddr and EvalVal appropriately.
539 switch (E->getStmtClass()) {
540
541 case Stmt::DeclRefExprClass: {
542 // DeclRefExpr: the base case. When we hit a DeclRefExpr we are looking
543 // at code that refers to a variable's name. We check if it has local
544 // storage within the function, and if so, return the expression.
545 DeclRefExpr *DR = cast<DeclRefExpr>(E);
546
547 if (VarDecl *V = dyn_cast<VarDecl>(DR->getDecl()))
548 if(V->hasLocalStorage()) return DR;
549
550 return NULL;
551 }
552
553 case Stmt::ParenExprClass:
554 // Ignore parentheses.
555 return EvalVal(cast<ParenExpr>(E)->getSubExpr());
556
557 case Stmt::UnaryOperatorClass: {
558 // The only unary operator that make sense to handle here
559 // is Deref. All others don't resolve to a "name." This includes
560 // handling all sorts of rvalues passed to a unary operator.
561 UnaryOperator *U = cast<UnaryOperator>(E);
562
563 if (U->getOpcode() == UnaryOperator::Deref)
564 return EvalAddr(U->getSubExpr());
565
566 return NULL;
567 }
568
569 case Stmt::ArraySubscriptExprClass: {
570 // Array subscripts are potential references to data on the stack. We
571 // retrieve the DeclRefExpr* for the array variable if it indeed
572 // has local storage.
Ted Kremenek23245122007-08-20 16:18:38 +0000573 return EvalAddr(cast<ArraySubscriptExpr>(E)->getBase());
Ted Kremenek06de2762007-08-17 16:46:58 +0000574 }
575
576 case Stmt::ConditionalOperatorClass: {
577 // For conditional operators we need to see if either the LHS or RHS are
578 // non-NULL DeclRefExpr's. If one is non-NULL, we return it.
579 ConditionalOperator *C = cast<ConditionalOperator>(E);
580
581 if (DeclRefExpr *LHS = EvalVal(C->getLHS()))
582 return LHS;
583 else
584 return EvalVal(C->getRHS());
585 }
586
587 // Accesses to members are potential references to data on the stack.
588 case Stmt::MemberExprClass: {
589 MemberExpr *M = cast<MemberExpr>(E);
590
591 // Check for indirect access. We only want direct field accesses.
592 if (!M->isArrow())
593 return EvalVal(M->getBase());
594 else
595 return NULL;
596 }
597
598 // Everything else: we simply don't reason about them.
599 default:
600 return NULL;
601 }
602}