blob: 1917d6317d87c9754f4ae1f17b34a6e3c7aab636 [file] [log] [blame]
Chris Lattner59907c42007-08-10 20:18:51 +00001//===--- SemaChecking.cpp - Extra Semantic Checking -----------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Ted Kremenek and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements extra semantic analysis beyond what is enforced
11// by the C type system.
12//
13//===----------------------------------------------------------------------===//
14
15#include "Sema.h"
16#include "clang/AST/ASTContext.h"
17#include "clang/AST/Decl.h"
18#include "clang/AST/Expr.h"
19#include "clang/Lex/Preprocessor.h"
20#include "clang/Lex/LiteralSupport.h"
21#include "clang/Basic/SourceManager.h"
22#include "clang/Basic/Diagnostic.h"
23#include "clang/Basic/LangOptions.h"
24#include "clang/Basic/TargetInfo.h"
25#include "llvm/ADT/SmallString.h"
26#include "llvm/ADT/StringExtras.h"
27using namespace clang;
28
29/// CheckFunctionCall - Check a direct function call for various correctness
30/// and safety properties not strictly enforced by the C type system.
Anders Carlsson71993dd2007-08-17 05:31:46 +000031bool
Ted Kremenek71895b92007-08-14 17:39:48 +000032Sema::CheckFunctionCall(Expr *Fn,
33 SourceLocation LParenLoc, SourceLocation RParenLoc,
34 FunctionDecl *FDecl,
Chris Lattner59907c42007-08-10 20:18:51 +000035 Expr** Args, unsigned NumArgsInCall) {
36
37 // Get the IdentifierInfo* for the called function.
38 IdentifierInfo *FnInfo = FDecl->getIdentifier();
39
Anders Carlsson71993dd2007-08-17 05:31:46 +000040 if (FnInfo->getBuiltinID() ==
41 Builtin::BI__builtin___CFStringMakeConstantString) {
42 assert(NumArgsInCall == 1 &&
43 "Wrong number of arguments to builtin CFStringMakeConstantString");
44 return CheckBuiltinCFStringArgument(Args[0]);
45 }
46
Chris Lattner59907c42007-08-10 20:18:51 +000047 // Search the KnownFunctionIDs for the identifier.
48 unsigned i = 0, e = id_num_known_functions;
Ted Kremenek71895b92007-08-14 17:39:48 +000049 for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000050 if (i == e) return false;
Chris Lattner59907c42007-08-10 20:18:51 +000051
52 // Printf checking.
53 if (i <= id_vprintf) {
Ted Kremenek71895b92007-08-14 17:39:48 +000054 // Retrieve the index of the format string parameter and determine
55 // if the function is passed a va_arg argument.
Chris Lattner59907c42007-08-10 20:18:51 +000056 unsigned format_idx = 0;
Ted Kremenek71895b92007-08-14 17:39:48 +000057 bool HasVAListArg = false;
58
Chris Lattner59907c42007-08-10 20:18:51 +000059 switch (i) {
60 default: assert(false && "No format string argument index.");
61 case id_printf: format_idx = 0; break;
62 case id_fprintf: format_idx = 1; break;
63 case id_sprintf: format_idx = 1; break;
64 case id_snprintf: format_idx = 2; break;
Ted Kremenek71895b92007-08-14 17:39:48 +000065 case id_asprintf: format_idx = 1; HasVAListArg = true; break;
66 case id_vsnprintf: format_idx = 2; HasVAListArg = true; break;
67 case id_vasprintf: format_idx = 1; HasVAListArg = true; break;
68 case id_vfprintf: format_idx = 1; HasVAListArg = true; break;
69 case id_vsprintf: format_idx = 1; HasVAListArg = true; break;
70 case id_vprintf: format_idx = 0; HasVAListArg = true; break;
71 }
72
73 CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg,
74 FDecl, format_idx, Args, NumArgsInCall);
Chris Lattner59907c42007-08-10 20:18:51 +000075 }
Anders Carlsson71993dd2007-08-17 05:31:46 +000076
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000077 return false;
Anders Carlsson71993dd2007-08-17 05:31:46 +000078}
79
80/// CheckBuiltinCFStringArgument - Checks that the argument to the builtin
81/// CFString constructor is correct
82bool Sema::CheckBuiltinCFStringArgument(Expr* Arg)
83{
84 while (ParenExpr *PE = dyn_cast<ParenExpr>(Arg))
85 Arg = PE->getSubExpr();
86
87 StringLiteral *Literal = dyn_cast<StringLiteral>(Arg);
88
89 if (!Literal || Literal->isWide()) {
90 Diag(Arg->getLocStart(),
91 diag::err_cfstring_literal_not_string_constant,
92 Arg->getSourceRange());
Anders Carlsson9cdc4d32007-08-17 15:44:17 +000093 return true;
Anders Carlsson71993dd2007-08-17 05:31:46 +000094 }
95
96 const char *Data = Literal->getStrData();
97 unsigned Length = Literal->getByteLength();
98
99 for (unsigned i = 0; i < Length; ++i) {
100 if (!isascii(Data[i])) {
101 Diag(PP.AdvanceToTokenCharacter(Arg->getLocStart(), i + 1),
102 diag::warn_cfstring_literal_contains_non_ascii_character,
103 Arg->getSourceRange());
104 break;
105 }
106
107 if (!Data[i]) {
108 Diag(PP.AdvanceToTokenCharacter(Arg->getLocStart(), i + 1),
109 diag::warn_cfstring_literal_contains_nul_character,
110 Arg->getSourceRange());
111 break;
112 }
113 }
114
Anders Carlsson9cdc4d32007-08-17 15:44:17 +0000115 return false;
Chris Lattner59907c42007-08-10 20:18:51 +0000116}
117
118/// CheckPrintfArguments - Check calls to printf (and similar functions) for
Ted Kremenek71895b92007-08-14 17:39:48 +0000119/// correct use of format strings.
120///
121/// HasVAListArg - A predicate indicating whether the printf-like
122/// function is passed an explicit va_arg argument (e.g., vprintf)
123///
124/// format_idx - The index into Args for the format string.
125///
126/// Improper format strings to functions in the printf family can be
127/// the source of bizarre bugs and very serious security holes. A
128/// good source of information is available in the following paper
129/// (which includes additional references):
Chris Lattner59907c42007-08-10 20:18:51 +0000130///
131/// FormatGuard: Automatic Protection From printf Format String
132/// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001.
Ted Kremenek71895b92007-08-14 17:39:48 +0000133///
134/// Functionality implemented:
135///
136/// We can statically check the following properties for string
137/// literal format strings for non v.*printf functions (where the
138/// arguments are passed directly):
139//
140/// (1) Are the number of format conversions equal to the number of
141/// data arguments?
142///
143/// (2) Does each format conversion correctly match the type of the
144/// corresponding data argument? (TODO)
145///
146/// Moreover, for all printf functions we can:
147///
148/// (3) Check for a missing format string (when not caught by type checking).
149///
150/// (4) Check for no-operation flags; e.g. using "#" with format
151/// conversion 'c' (TODO)
152///
153/// (5) Check the use of '%n', a major source of security holes.
154///
155/// (6) Check for malformed format conversions that don't specify anything.
156///
157/// (7) Check for empty format strings. e.g: printf("");
158///
159/// (8) Check that the format string is a wide literal.
160///
161/// All of these checks can be done by parsing the format string.
162///
163/// For now, we ONLY do (1), (3), (5), (6), (7), and (8).
Chris Lattner59907c42007-08-10 20:18:51 +0000164void
Ted Kremenek71895b92007-08-14 17:39:48 +0000165Sema::CheckPrintfArguments(Expr *Fn,
166 SourceLocation LParenLoc, SourceLocation RParenLoc,
167 bool HasVAListArg, FunctionDecl *FDecl,
Ted Kremenek82077102007-08-10 21:21:05 +0000168 unsigned format_idx, Expr** Args,
169 unsigned NumArgsInCall) {
Ted Kremenek71895b92007-08-14 17:39:48 +0000170 // CHECK: printf-like function is called with no format string.
171 if (format_idx >= NumArgsInCall) {
172 Diag(RParenLoc, diag::warn_printf_missing_format_string,
173 Fn->getSourceRange());
174 return;
175 }
176
Chris Lattner59907c42007-08-10 20:18:51 +0000177 // CHECK: format string is not a string literal.
178 //
Ted Kremenek71895b92007-08-14 17:39:48 +0000179 // Dynamically generated format strings are difficult to
180 // automatically vet at compile time. Requiring that format strings
181 // are string literals: (1) permits the checking of format strings by
182 // the compiler and thereby (2) can practically remove the source of
183 // many format string exploits.
Chris Lattner59907c42007-08-10 20:18:51 +0000184 StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]);
185
Ted Kremenek71895b92007-08-14 17:39:48 +0000186 if (FExpr == NULL) {
187 Diag(Args[format_idx]->getLocStart(),
188 diag::warn_printf_not_string_constant, Fn->getSourceRange());
189 return;
190 }
191
192 // CHECK: is the format string a wide literal?
193 if (FExpr->isWide()) {
194 Diag(Args[format_idx]->getLocStart(),
195 diag::warn_printf_format_string_is_wide_literal,
196 Fn->getSourceRange());
197 return;
198 }
199
200 // Str - The format string. NOTE: this is NOT null-terminated!
201 const char * const Str = FExpr->getStrData();
202
203 // CHECK: empty format string?
204 const unsigned StrLen = FExpr->getByteLength();
205
206 if (StrLen == 0) {
207 Diag(Args[format_idx]->getLocStart(),
208 diag::warn_printf_empty_format_string, Fn->getSourceRange());
209 return;
210 }
211
212 // We process the format string using a binary state machine. The
213 // current state is stored in CurrentState.
214 enum {
215 state_OrdChr,
216 state_Conversion
217 } CurrentState = state_OrdChr;
218
219 // numConversions - The number of conversions seen so far. This is
220 // incremented as we traverse the format string.
221 unsigned numConversions = 0;
222
223 // numDataArgs - The number of data arguments after the format
224 // string. This can only be determined for non vprintf-like
225 // functions. For those functions, this value is 1 (the sole
226 // va_arg argument).
227 unsigned numDataArgs = NumArgsInCall-(format_idx+1);
228
229 // Inspect the format string.
230 unsigned StrIdx = 0;
231
232 // LastConversionIdx - Index within the format string where we last saw
233 // a '%' character that starts a new format conversion.
234 unsigned LastConversionIdx = 0;
235
236 for ( ; StrIdx < StrLen ; ++StrIdx ) {
237
238 // Is the number of detected conversion conversions greater than
239 // the number of matching data arguments? If so, stop.
240 if (!HasVAListArg && numConversions > numDataArgs) break;
241
242 // Handle "\0"
243 if(Str[StrIdx] == '\0' ) {
244 // The string returned by getStrData() is not null-terminated,
245 // so the presence of a null character is likely an error.
246
247 SourceLocation Loc =
248 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1);
249
250 Diag(Loc, diag::warn_printf_format_string_contains_null_char,
251 Fn->getSourceRange());
252
253 return;
254 }
255
256 // Ordinary characters (not processing a format conversion).
257 if (CurrentState == state_OrdChr) {
258 if (Str[StrIdx] == '%') {
259 CurrentState = state_Conversion;
260 LastConversionIdx = StrIdx;
261 }
262 continue;
263 }
264
265 // Seen '%'. Now processing a format conversion.
266 switch (Str[StrIdx]) {
267 // Characters which can terminate a format conversion
268 // (e.g. "%d"). Characters that specify length modifiers or
269 // other flags are handled by the default case below.
270 //
271 // TODO: additional checks will go into the following cases.
272 case 'i':
273 case 'd':
274 case 'o':
275 case 'u':
276 case 'x':
277 case 'X':
278 case 'D':
279 case 'O':
280 case 'U':
281 case 'e':
282 case 'E':
283 case 'f':
284 case 'F':
285 case 'g':
286 case 'G':
287 case 'a':
288 case 'A':
289 case 'c':
290 case 'C':
291 case 'S':
292 case 's':
293 case 'P':
294 ++numConversions;
295 CurrentState = state_OrdChr;
296 break;
297
298 // CHECK: Are we using "%n"? Issue a warning.
299 case 'n': {
300 ++numConversions;
301 CurrentState = state_OrdChr;
302 SourceLocation Loc =
303 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
304 LastConversionIdx+1);
305
306 Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange());
307 break;
308 }
309
310 // Handle "%%"
311 case '%':
312 // Sanity check: Was the first "%" character the previous one?
313 // If not, we will assume that we have a malformed format
314 // conversion, and that the current "%" character is the start
315 // of a new conversion.
316 if (StrIdx - LastConversionIdx == 1)
317 CurrentState = state_OrdChr;
318 else {
319 // Issue a warning: invalid format conversion.
320 SourceLocation Loc =
321 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
322 LastConversionIdx+1);
323
324 Diag(Loc, diag::warn_printf_invalid_conversion,
325 std::string(Str+LastConversionIdx, Str+StrIdx),
326 Fn->getSourceRange());
327
328 // This conversion is broken. Advance to the next format
329 // conversion.
330 LastConversionIdx = StrIdx;
331 ++numConversions;
332 }
333
334 break;
335
336 default:
337 // This case catches all other characters: flags, widths, etc.
338 // We should eventually process those as well.
339 break;
340 }
341 }
342
343 if (CurrentState == state_Conversion) {
344 // Issue a warning: invalid format conversion.
345 SourceLocation Loc =
346 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
347 LastConversionIdx+1);
348
349 Diag(Loc, diag::warn_printf_invalid_conversion,
350 std::string(Str+LastConversionIdx, Str+StrIdx),
351 Fn->getSourceRange());
352 return;
353 }
354
355 if (!HasVAListArg) {
356 // CHECK: Does the number of format conversions exceed the number
357 // of data arguments?
358 if (numConversions > numDataArgs) {
359 SourceLocation Loc =
360 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
361 LastConversionIdx);
362
363 Diag(Loc, diag::warn_printf_insufficient_data_args,
364 Fn->getSourceRange());
365 }
366 // CHECK: Does the number of data arguments exceed the number of
367 // format conversions in the format string?
368 else if (numConversions < numDataArgs)
369 Diag(Args[format_idx+numConversions+1]->getLocStart(),
370 diag::warn_printf_too_many_data_args, Fn->getSourceRange());
371 }
372}