blob: d699e1bcf818f0989cc46ee1317cea0dba93afe2 [file] [log] [blame]
Chris Lattner59907c42007-08-10 20:18:51 +00001//===--- SemaChecking.cpp - Extra Semantic Checking -----------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Ted Kremenek and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements extra semantic analysis beyond what is enforced
11// by the C type system.
12//
13//===----------------------------------------------------------------------===//
14
15#include "Sema.h"
16#include "clang/AST/ASTContext.h"
17#include "clang/AST/Decl.h"
18#include "clang/AST/Expr.h"
19#include "clang/Lex/Preprocessor.h"
20#include "clang/Lex/LiteralSupport.h"
21#include "clang/Basic/SourceManager.h"
22#include "clang/Basic/Diagnostic.h"
23#include "clang/Basic/LangOptions.h"
24#include "clang/Basic/TargetInfo.h"
25#include "llvm/ADT/SmallString.h"
26#include "llvm/ADT/StringExtras.h"
27using namespace clang;
28
29/// CheckFunctionCall - Check a direct function call for various correctness
30/// and safety properties not strictly enforced by the C type system.
Anders Carlsson71993dd2007-08-17 05:31:46 +000031bool
Ted Kremenek71895b92007-08-14 17:39:48 +000032Sema::CheckFunctionCall(Expr *Fn,
33 SourceLocation LParenLoc, SourceLocation RParenLoc,
34 FunctionDecl *FDecl,
Chris Lattner59907c42007-08-10 20:18:51 +000035 Expr** Args, unsigned NumArgsInCall) {
36
37 // Get the IdentifierInfo* for the called function.
38 IdentifierInfo *FnInfo = FDecl->getIdentifier();
39
Anders Carlsson71993dd2007-08-17 05:31:46 +000040 if (FnInfo->getBuiltinID() ==
41 Builtin::BI__builtin___CFStringMakeConstantString) {
42 assert(NumArgsInCall == 1 &&
43 "Wrong number of arguments to builtin CFStringMakeConstantString");
44 return CheckBuiltinCFStringArgument(Args[0]);
45 }
46
Chris Lattner59907c42007-08-10 20:18:51 +000047 // Search the KnownFunctionIDs for the identifier.
48 unsigned i = 0, e = id_num_known_functions;
Ted Kremenek71895b92007-08-14 17:39:48 +000049 for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000050 if (i == e) return false;
Chris Lattner59907c42007-08-10 20:18:51 +000051
52 // Printf checking.
53 if (i <= id_vprintf) {
Ted Kremenek71895b92007-08-14 17:39:48 +000054 // Retrieve the index of the format string parameter and determine
55 // if the function is passed a va_arg argument.
Chris Lattner59907c42007-08-10 20:18:51 +000056 unsigned format_idx = 0;
Ted Kremenek71895b92007-08-14 17:39:48 +000057 bool HasVAListArg = false;
58
Chris Lattner59907c42007-08-10 20:18:51 +000059 switch (i) {
60 default: assert(false && "No format string argument index.");
61 case id_printf: format_idx = 0; break;
62 case id_fprintf: format_idx = 1; break;
63 case id_sprintf: format_idx = 1; break;
64 case id_snprintf: format_idx = 2; break;
Ted Kremenek71895b92007-08-14 17:39:48 +000065 case id_asprintf: format_idx = 1; HasVAListArg = true; break;
66 case id_vsnprintf: format_idx = 2; HasVAListArg = true; break;
67 case id_vasprintf: format_idx = 1; HasVAListArg = true; break;
68 case id_vfprintf: format_idx = 1; HasVAListArg = true; break;
69 case id_vsprintf: format_idx = 1; HasVAListArg = true; break;
70 case id_vprintf: format_idx = 0; HasVAListArg = true; break;
71 }
72
73 CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg,
Ted Kremenek06de2762007-08-17 16:46:58 +000074 FDecl, format_idx, Args, NumArgsInCall);
Chris Lattner59907c42007-08-10 20:18:51 +000075 }
Anders Carlsson71993dd2007-08-17 05:31:46 +000076
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000077 return false;
Anders Carlsson71993dd2007-08-17 05:31:46 +000078}
79
80/// CheckBuiltinCFStringArgument - Checks that the argument to the builtin
81/// CFString constructor is correct
82bool Sema::CheckBuiltinCFStringArgument(Expr* Arg)
83{
84 while (ParenExpr *PE = dyn_cast<ParenExpr>(Arg))
85 Arg = PE->getSubExpr();
86
87 StringLiteral *Literal = dyn_cast<StringLiteral>(Arg);
88
89 if (!Literal || Literal->isWide()) {
90 Diag(Arg->getLocStart(),
91 diag::err_cfstring_literal_not_string_constant,
92 Arg->getSourceRange());
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000093 return true;
Anders Carlsson71993dd2007-08-17 05:31:46 +000094 }
95
96 const char *Data = Literal->getStrData();
97 unsigned Length = Literal->getByteLength();
98
99 for (unsigned i = 0; i < Length; ++i) {
100 if (!isascii(Data[i])) {
101 Diag(PP.AdvanceToTokenCharacter(Arg->getLocStart(), i + 1),
102 diag::warn_cfstring_literal_contains_non_ascii_character,
103 Arg->getSourceRange());
104 break;
105 }
106
107 if (!Data[i]) {
108 Diag(PP.AdvanceToTokenCharacter(Arg->getLocStart(), i + 1),
109 diag::warn_cfstring_literal_contains_nul_character,
110 Arg->getSourceRange());
111 break;
112 }
113 }
114
Anders Carlsson9cdc4d32007-08-17 15:44:17 +0000115 return false;
Chris Lattner59907c42007-08-10 20:18:51 +0000116}
117
118/// CheckPrintfArguments - Check calls to printf (and similar functions) for
Ted Kremenek71895b92007-08-14 17:39:48 +0000119/// correct use of format strings.
120///
121/// HasVAListArg - A predicate indicating whether the printf-like
122/// function is passed an explicit va_arg argument (e.g., vprintf)
123///
124/// format_idx - The index into Args for the format string.
125///
126/// Improper format strings to functions in the printf family can be
127/// the source of bizarre bugs and very serious security holes. A
128/// good source of information is available in the following paper
129/// (which includes additional references):
Chris Lattner59907c42007-08-10 20:18:51 +0000130///
131/// FormatGuard: Automatic Protection From printf Format String
132/// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001.
Ted Kremenek71895b92007-08-14 17:39:48 +0000133///
134/// Functionality implemented:
135///
136/// We can statically check the following properties for string
137/// literal format strings for non v.*printf functions (where the
138/// arguments are passed directly):
139//
140/// (1) Are the number of format conversions equal to the number of
141/// data arguments?
142///
143/// (2) Does each format conversion correctly match the type of the
144/// corresponding data argument? (TODO)
145///
146/// Moreover, for all printf functions we can:
147///
148/// (3) Check for a missing format string (when not caught by type checking).
149///
150/// (4) Check for no-operation flags; e.g. using "#" with format
151/// conversion 'c' (TODO)
152///
153/// (5) Check the use of '%n', a major source of security holes.
154///
155/// (6) Check for malformed format conversions that don't specify anything.
156///
157/// (7) Check for empty format strings. e.g: printf("");
158///
159/// (8) Check that the format string is a wide literal.
160///
161/// All of these checks can be done by parsing the format string.
162///
163/// For now, we ONLY do (1), (3), (5), (6), (7), and (8).
Chris Lattner59907c42007-08-10 20:18:51 +0000164void
Ted Kremenek71895b92007-08-14 17:39:48 +0000165Sema::CheckPrintfArguments(Expr *Fn,
166 SourceLocation LParenLoc, SourceLocation RParenLoc,
167 bool HasVAListArg, FunctionDecl *FDecl,
Ted Kremenek82077102007-08-10 21:21:05 +0000168 unsigned format_idx, Expr** Args,
169 unsigned NumArgsInCall) {
Ted Kremenek71895b92007-08-14 17:39:48 +0000170 // CHECK: printf-like function is called with no format string.
171 if (format_idx >= NumArgsInCall) {
172 Diag(RParenLoc, diag::warn_printf_missing_format_string,
173 Fn->getSourceRange());
174 return;
175 }
176
Chris Lattner59907c42007-08-10 20:18:51 +0000177 // CHECK: format string is not a string literal.
178 //
Ted Kremenek71895b92007-08-14 17:39:48 +0000179 // Dynamically generated format strings are difficult to
180 // automatically vet at compile time. Requiring that format strings
181 // are string literals: (1) permits the checking of format strings by
182 // the compiler and thereby (2) can practically remove the source of
183 // many format string exploits.
Chris Lattner59907c42007-08-10 20:18:51 +0000184 StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]);
185
Ted Kremenek71895b92007-08-14 17:39:48 +0000186 if (FExpr == NULL) {
187 Diag(Args[format_idx]->getLocStart(),
188 diag::warn_printf_not_string_constant, Fn->getSourceRange());
189 return;
190 }
191
192 // CHECK: is the format string a wide literal?
193 if (FExpr->isWide()) {
194 Diag(Args[format_idx]->getLocStart(),
195 diag::warn_printf_format_string_is_wide_literal,
196 Fn->getSourceRange());
197 return;
198 }
199
200 // Str - The format string. NOTE: this is NOT null-terminated!
201 const char * const Str = FExpr->getStrData();
202
203 // CHECK: empty format string?
204 const unsigned StrLen = FExpr->getByteLength();
205
206 if (StrLen == 0) {
207 Diag(Args[format_idx]->getLocStart(),
208 diag::warn_printf_empty_format_string, Fn->getSourceRange());
209 return;
210 }
211
212 // We process the format string using a binary state machine. The
213 // current state is stored in CurrentState.
214 enum {
215 state_OrdChr,
216 state_Conversion
217 } CurrentState = state_OrdChr;
218
219 // numConversions - The number of conversions seen so far. This is
220 // incremented as we traverse the format string.
221 unsigned numConversions = 0;
222
223 // numDataArgs - The number of data arguments after the format
224 // string. This can only be determined for non vprintf-like
225 // functions. For those functions, this value is 1 (the sole
226 // va_arg argument).
227 unsigned numDataArgs = NumArgsInCall-(format_idx+1);
228
229 // Inspect the format string.
230 unsigned StrIdx = 0;
231
232 // LastConversionIdx - Index within the format string where we last saw
233 // a '%' character that starts a new format conversion.
234 unsigned LastConversionIdx = 0;
235
236 for ( ; StrIdx < StrLen ; ++StrIdx ) {
237
238 // Is the number of detected conversion conversions greater than
239 // the number of matching data arguments? If so, stop.
240 if (!HasVAListArg && numConversions > numDataArgs) break;
241
242 // Handle "\0"
243 if(Str[StrIdx] == '\0' ) {
244 // The string returned by getStrData() is not null-terminated,
245 // so the presence of a null character is likely an error.
246
247 SourceLocation Loc =
248 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1);
249
250 Diag(Loc, diag::warn_printf_format_string_contains_null_char,
251 Fn->getSourceRange());
252
253 return;
254 }
255
256 // Ordinary characters (not processing a format conversion).
257 if (CurrentState == state_OrdChr) {
258 if (Str[StrIdx] == '%') {
259 CurrentState = state_Conversion;
260 LastConversionIdx = StrIdx;
261 }
262 continue;
263 }
264
265 // Seen '%'. Now processing a format conversion.
266 switch (Str[StrIdx]) {
267 // Characters which can terminate a format conversion
268 // (e.g. "%d"). Characters that specify length modifiers or
269 // other flags are handled by the default case below.
270 //
271 // TODO: additional checks will go into the following cases.
272 case 'i':
273 case 'd':
274 case 'o':
275 case 'u':
276 case 'x':
277 case 'X':
278 case 'D':
279 case 'O':
280 case 'U':
281 case 'e':
282 case 'E':
283 case 'f':
284 case 'F':
285 case 'g':
286 case 'G':
287 case 'a':
288 case 'A':
289 case 'c':
290 case 'C':
291 case 'S':
292 case 's':
293 case 'P':
294 ++numConversions;
295 CurrentState = state_OrdChr;
296 break;
297
298 // CHECK: Are we using "%n"? Issue a warning.
299 case 'n': {
300 ++numConversions;
301 CurrentState = state_OrdChr;
302 SourceLocation Loc =
303 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
304 LastConversionIdx+1);
305
306 Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange());
307 break;
308 }
309
310 // Handle "%%"
311 case '%':
312 // Sanity check: Was the first "%" character the previous one?
313 // If not, we will assume that we have a malformed format
314 // conversion, and that the current "%" character is the start
315 // of a new conversion.
316 if (StrIdx - LastConversionIdx == 1)
317 CurrentState = state_OrdChr;
318 else {
319 // Issue a warning: invalid format conversion.
320 SourceLocation Loc =
321 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
322 LastConversionIdx+1);
323
324 Diag(Loc, diag::warn_printf_invalid_conversion,
325 std::string(Str+LastConversionIdx, Str+StrIdx),
326 Fn->getSourceRange());
327
328 // This conversion is broken. Advance to the next format
329 // conversion.
330 LastConversionIdx = StrIdx;
331 ++numConversions;
332 }
333
334 break;
335
336 default:
337 // This case catches all other characters: flags, widths, etc.
338 // We should eventually process those as well.
339 break;
340 }
341 }
342
343 if (CurrentState == state_Conversion) {
344 // Issue a warning: invalid format conversion.
345 SourceLocation Loc =
346 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
347 LastConversionIdx+1);
348
349 Diag(Loc, diag::warn_printf_invalid_conversion,
Ted Kremenek06de2762007-08-17 16:46:58 +0000350 std::string(Str+LastConversionIdx, Str+StrIdx),
Ted Kremenek71895b92007-08-14 17:39:48 +0000351 Fn->getSourceRange());
352 return;
353 }
354
355 if (!HasVAListArg) {
356 // CHECK: Does the number of format conversions exceed the number
357 // of data arguments?
358 if (numConversions > numDataArgs) {
359 SourceLocation Loc =
360 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
361 LastConversionIdx);
362
363 Diag(Loc, diag::warn_printf_insufficient_data_args,
364 Fn->getSourceRange());
365 }
366 // CHECK: Does the number of data arguments exceed the number of
367 // format conversions in the format string?
368 else if (numConversions < numDataArgs)
369 Diag(Args[format_idx+numConversions+1]->getLocStart(),
370 diag::warn_printf_too_many_data_args, Fn->getSourceRange());
371 }
372}
Ted Kremenek06de2762007-08-17 16:46:58 +0000373
374//===--- CHECK: Return Address of Stack Variable --------------------------===//
375
376static DeclRefExpr* EvalVal(Expr *E);
377static DeclRefExpr* EvalAddr(Expr* E);
378
379/// CheckReturnStackAddr - Check if a return statement returns the address
380/// of a stack variable.
381void
382Sema::CheckReturnStackAddr(Expr *RetValExp, QualType lhsType,
383 SourceLocation ReturnLoc) {
384
385 // Perform checking for returned stack addresses.
386 if (lhsType->isPointerType()) {
387 if (DeclRefExpr *DR = EvalAddr(RetValExp))
388 Diag(DR->getLocStart(), diag::warn_ret_stack_addr,
389 DR->getDecl()->getIdentifier()->getName(),
390 RetValExp->getSourceRange());
391 }
392 // Perform checking for stack values returned by reference.
393 else if (lhsType->isReferenceType()) {
394 if (DeclRefExpr *DR = EvalVal(RetValExp))
395 Diag(DR->getLocStart(), diag::warn_ret_stack_ref,
396 DR->getDecl()->getIdentifier()->getName(),
397 RetValExp->getSourceRange());
398 }
399}
400
401/// EvalAddr - EvalAddr and EvalVal are mutually recursive functions that
402/// check if the expression in a return statement evaluates to an address
403/// to a location on the stack. The recursion is used to traverse the
404/// AST of the return expression, with recursion backtracking when we
405/// encounter a subexpression that (1) clearly does not lead to the address
406/// of a stack variable or (2) is something we cannot determine leads to
407/// the address of a stack variable based on such local checking.
408///
409/// EvalAddr processes expressions that are pointers, and EvalVal handles
410/// expressions that are rvalues or variable references.
411/// At the base case of the recursion is a check for a DeclRefExpr* in
412/// the refers to a stack variable.
413///
414/// This implementation handles:
415///
416/// * pointer-to-pointer casts
417/// * implicit conversions from array references to pointers
418/// * taking the address of fields
419/// * arbitrary interplay between "&" and "*" operators
420/// * pointer arithmetic from an address of a stack variable
421/// * taking the address of an array element where the array is on the stack
422static DeclRefExpr* EvalAddr(Expr *E) {
423
424 // We should only be called for evaluating pointer expressions.
425 assert (E->getType()->isPointerType() && "EvalAddr only works on pointers");
426
427 // Our "symbolic interpreter" is just a dispatch off the currently
428 // viewed AST node. We then recursively traverse the AST by calling
429 // EvalAddr and EvalVal appropriately.
430 switch (E->getStmtClass()) {
431
432 case Stmt::ParenExprClass:
433 // Ignore parentheses.
434 return EvalAddr(cast<ParenExpr>(E)->getSubExpr());
435
436 case Stmt::UnaryOperatorClass: {
437 // The only unary operator that make sense to handle here
438 // is AddrOf. All others don't make sense as pointers.
439 UnaryOperator *U = cast<UnaryOperator>(E);
440
441 if (U->getOpcode() == UnaryOperator::AddrOf)
442 return EvalVal(U->getSubExpr());
443 else
444 return NULL;
445 }
446
447 case Stmt::BinaryOperatorClass: {
448 // Handle pointer arithmetic. All other binary operators are not valid
449 // in this context.
450 BinaryOperator *B = cast<BinaryOperator>(E);
451 BinaryOperator::Opcode op = B->getOpcode();
452
453 if (op != BinaryOperator::Add && op != BinaryOperator::Sub)
454 return NULL;
455
456 Expr *Base = B->getLHS();
457
458 // Determine which argument is the real pointer base. It could be
459 // the RHS argument instead of the LHS.
460 if (!Base->getType()->isPointerType()) Base = B->getRHS();
461
462 assert (Base->getType()->isPointerType());
463 return EvalAddr(Base);
464 }
465
466 // For conditional operators we need to see if either the LHS or RHS are
467 // valid DeclRefExpr*s. If one of them is valid, we return it.
468 case Stmt::ConditionalOperatorClass: {
469 ConditionalOperator *C = cast<ConditionalOperator>(E);
470
471 if (DeclRefExpr* LHS = EvalAddr(C->getLHS()))
472 return LHS;
473 else
474 return EvalAddr(C->getRHS());
475 }
476
477 // For implicit casts, we need to handle conversions from arrays to
478 // pointer values, and implicit pointer-to-pointer conversions.
479 case Stmt::ImplicitCastExprClass: {
480 ImplicitCastExpr *IE = cast<ImplicitCastExpr>(E);
481 Expr* SubExpr = IE->getSubExpr();
482
483 if (SubExpr->getType()->isPointerType())
484 return EvalAddr(SubExpr);
485 else
486 return EvalVal(SubExpr);
487 }
488
489 // For casts, we handle pointer-to-pointer conversions (which
490 // is essentially a no-op from our mini-interpreter's standpoint).
491 // For other casts we abort.
492 case Stmt::CastExprClass: {
493 CastExpr *C = cast<CastExpr>(E);
494 Expr *SubExpr = C->getSubExpr();
495
496 if (SubExpr->getType()->isPointerType())
497 return EvalAddr(SubExpr);
498 else
499 return NULL;
500 }
501
502 // TODO: C++ casts.
503 case Stmt::CXXCastExprClass:
504 return NULL;
505
506 // Everything else: we simply don't reason about them.
507 default:
508 return NULL;
509 }
510}
511
512
513/// EvalVal - This function is complements EvalAddr in the mutual recursion.
514/// See the comments for EvalAddr for more details.
515static DeclRefExpr* EvalVal(Expr *E) {
516
517 // We should only be called for evaluating non-pointer expressions.
518 assert (!E->getType()->isPointerType() && "EvalVal doesn't work on pointers");
519
520 // Our "symbolic interpreter" is just a dispatch off the currently
521 // viewed AST node. We then recursively traverse the AST by calling
522 // EvalAddr and EvalVal appropriately.
523 switch (E->getStmtClass()) {
524
525 case Stmt::DeclRefExprClass: {
526 // DeclRefExpr: the base case. When we hit a DeclRefExpr we are looking
527 // at code that refers to a variable's name. We check if it has local
528 // storage within the function, and if so, return the expression.
529 DeclRefExpr *DR = cast<DeclRefExpr>(E);
530
531 if (VarDecl *V = dyn_cast<VarDecl>(DR->getDecl()))
532 if(V->hasLocalStorage()) return DR;
533
534 return NULL;
535 }
536
537 case Stmt::ParenExprClass:
538 // Ignore parentheses.
539 return EvalVal(cast<ParenExpr>(E)->getSubExpr());
540
541 case Stmt::UnaryOperatorClass: {
542 // The only unary operator that make sense to handle here
543 // is Deref. All others don't resolve to a "name." This includes
544 // handling all sorts of rvalues passed to a unary operator.
545 UnaryOperator *U = cast<UnaryOperator>(E);
546
547 if (U->getOpcode() == UnaryOperator::Deref)
548 return EvalAddr(U->getSubExpr());
549
550 return NULL;
551 }
552
553 case Stmt::ArraySubscriptExprClass: {
554 // Array subscripts are potential references to data on the stack. We
555 // retrieve the DeclRefExpr* for the array variable if it indeed
556 // has local storage.
557 ArraySubscriptExpr *A = cast<ArraySubscriptExpr>(E);
558
559 // The array access could be written A[4] or 4[A] (both are equivalent).
560 // In the second case, the "base" is the offset and the "Idx" is
561 // the base. We test for this case by seeing if the Base expression
562 // has a pointer type.
563 Expr* Base = A->getBase();
564
565 if (Base->getType()->isPointerType())
566 return EvalAddr(Base);
567 else
568 return EvalAddr(A->getIdx());
569 }
570
571 case Stmt::ConditionalOperatorClass: {
572 // For conditional operators we need to see if either the LHS or RHS are
573 // non-NULL DeclRefExpr's. If one is non-NULL, we return it.
574 ConditionalOperator *C = cast<ConditionalOperator>(E);
575
576 if (DeclRefExpr *LHS = EvalVal(C->getLHS()))
577 return LHS;
578 else
579 return EvalVal(C->getRHS());
580 }
581
582 // Accesses to members are potential references to data on the stack.
583 case Stmt::MemberExprClass: {
584 MemberExpr *M = cast<MemberExpr>(E);
585
586 // Check for indirect access. We only want direct field accesses.
587 if (!M->isArrow())
588 return EvalVal(M->getBase());
589 else
590 return NULL;
591 }
592
593 // Everything else: we simply don't reason about them.
594 default:
595 return NULL;
596 }
597}