blob: 511f56f2a87e2bcc2d9d92ea2f05915dc7c1ae47 [file] [log] [blame]
Chris Lattner59907c42007-08-10 20:18:51 +00001//===--- SemaChecking.cpp - Extra Semantic Checking -----------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Ted Kremenek and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements extra semantic analysis beyond what is enforced
11// by the C type system.
12//
13//===----------------------------------------------------------------------===//
14
15#include "Sema.h"
16#include "clang/AST/ASTContext.h"
17#include "clang/AST/Decl.h"
18#include "clang/AST/Expr.h"
19#include "clang/Lex/Preprocessor.h"
20#include "clang/Lex/LiteralSupport.h"
21#include "clang/Basic/SourceManager.h"
22#include "clang/Basic/Diagnostic.h"
23#include "clang/Basic/LangOptions.h"
24#include "clang/Basic/TargetInfo.h"
25#include "llvm/ADT/SmallString.h"
26#include "llvm/ADT/StringExtras.h"
27using namespace clang;
28
29/// CheckFunctionCall - Check a direct function call for various correctness
30/// and safety properties not strictly enforced by the C type system.
31void
Ted Kremenek71895b92007-08-14 17:39:48 +000032Sema::CheckFunctionCall(Expr *Fn,
33 SourceLocation LParenLoc, SourceLocation RParenLoc,
34 FunctionDecl *FDecl,
Chris Lattner59907c42007-08-10 20:18:51 +000035 Expr** Args, unsigned NumArgsInCall) {
36
37 // Get the IdentifierInfo* for the called function.
38 IdentifierInfo *FnInfo = FDecl->getIdentifier();
39
40 // Search the KnownFunctionIDs for the identifier.
41 unsigned i = 0, e = id_num_known_functions;
Ted Kremenek71895b92007-08-14 17:39:48 +000042 for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
43 if (i == e) return;
Chris Lattner59907c42007-08-10 20:18:51 +000044
45 // Printf checking.
46 if (i <= id_vprintf) {
Ted Kremenek71895b92007-08-14 17:39:48 +000047 // Retrieve the index of the format string parameter and determine
48 // if the function is passed a va_arg argument.
Chris Lattner59907c42007-08-10 20:18:51 +000049 unsigned format_idx = 0;
Ted Kremenek71895b92007-08-14 17:39:48 +000050 bool HasVAListArg = false;
51
Chris Lattner59907c42007-08-10 20:18:51 +000052 switch (i) {
53 default: assert(false && "No format string argument index.");
54 case id_printf: format_idx = 0; break;
55 case id_fprintf: format_idx = 1; break;
56 case id_sprintf: format_idx = 1; break;
57 case id_snprintf: format_idx = 2; break;
Ted Kremenek71895b92007-08-14 17:39:48 +000058 case id_asprintf: format_idx = 1; HasVAListArg = true; break;
59 case id_vsnprintf: format_idx = 2; HasVAListArg = true; break;
60 case id_vasprintf: format_idx = 1; HasVAListArg = true; break;
61 case id_vfprintf: format_idx = 1; HasVAListArg = true; break;
62 case id_vsprintf: format_idx = 1; HasVAListArg = true; break;
63 case id_vprintf: format_idx = 0; HasVAListArg = true; break;
64 }
65
66 CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg,
67 FDecl, format_idx, Args, NumArgsInCall);
Chris Lattner59907c42007-08-10 20:18:51 +000068 }
69}
70
71/// CheckPrintfArguments - Check calls to printf (and similar functions) for
Ted Kremenek71895b92007-08-14 17:39:48 +000072/// correct use of format strings.
73///
74/// HasVAListArg - A predicate indicating whether the printf-like
75/// function is passed an explicit va_arg argument (e.g., vprintf)
76///
77/// format_idx - The index into Args for the format string.
78///
79/// Improper format strings to functions in the printf family can be
80/// the source of bizarre bugs and very serious security holes. A
81/// good source of information is available in the following paper
82/// (which includes additional references):
Chris Lattner59907c42007-08-10 20:18:51 +000083///
84/// FormatGuard: Automatic Protection From printf Format String
85/// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001.
Ted Kremenek71895b92007-08-14 17:39:48 +000086///
87/// Functionality implemented:
88///
89/// We can statically check the following properties for string
90/// literal format strings for non v.*printf functions (where the
91/// arguments are passed directly):
92//
93/// (1) Are the number of format conversions equal to the number of
94/// data arguments?
95///
96/// (2) Does each format conversion correctly match the type of the
97/// corresponding data argument? (TODO)
98///
99/// Moreover, for all printf functions we can:
100///
101/// (3) Check for a missing format string (when not caught by type checking).
102///
103/// (4) Check for no-operation flags; e.g. using "#" with format
104/// conversion 'c' (TODO)
105///
106/// (5) Check the use of '%n', a major source of security holes.
107///
108/// (6) Check for malformed format conversions that don't specify anything.
109///
110/// (7) Check for empty format strings. e.g: printf("");
111///
112/// (8) Check that the format string is a wide literal.
113///
114/// All of these checks can be done by parsing the format string.
115///
116/// For now, we ONLY do (1), (3), (5), (6), (7), and (8).
Chris Lattner59907c42007-08-10 20:18:51 +0000117void
Ted Kremenek71895b92007-08-14 17:39:48 +0000118Sema::CheckPrintfArguments(Expr *Fn,
119 SourceLocation LParenLoc, SourceLocation RParenLoc,
120 bool HasVAListArg, FunctionDecl *FDecl,
Ted Kremenek82077102007-08-10 21:21:05 +0000121 unsigned format_idx, Expr** Args,
122 unsigned NumArgsInCall) {
Ted Kremenek71895b92007-08-14 17:39:48 +0000123 // CHECK: printf-like function is called with no format string.
124 if (format_idx >= NumArgsInCall) {
125 Diag(RParenLoc, diag::warn_printf_missing_format_string,
126 Fn->getSourceRange());
127 return;
128 }
129
Chris Lattner59907c42007-08-10 20:18:51 +0000130 // CHECK: format string is not a string literal.
131 //
Ted Kremenek71895b92007-08-14 17:39:48 +0000132 // Dynamically generated format strings are difficult to
133 // automatically vet at compile time. Requiring that format strings
134 // are string literals: (1) permits the checking of format strings by
135 // the compiler and thereby (2) can practically remove the source of
136 // many format string exploits.
Chris Lattner59907c42007-08-10 20:18:51 +0000137 StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]);
138
Ted Kremenek71895b92007-08-14 17:39:48 +0000139 if (FExpr == NULL) {
140 Diag(Args[format_idx]->getLocStart(),
141 diag::warn_printf_not_string_constant, Fn->getSourceRange());
142 return;
143 }
144
145 // CHECK: is the format string a wide literal?
146 if (FExpr->isWide()) {
147 Diag(Args[format_idx]->getLocStart(),
148 diag::warn_printf_format_string_is_wide_literal,
149 Fn->getSourceRange());
150 return;
151 }
152
153 // Str - The format string. NOTE: this is NOT null-terminated!
154 const char * const Str = FExpr->getStrData();
155
156 // CHECK: empty format string?
157 const unsigned StrLen = FExpr->getByteLength();
158
159 if (StrLen == 0) {
160 Diag(Args[format_idx]->getLocStart(),
161 diag::warn_printf_empty_format_string, Fn->getSourceRange());
162 return;
163 }
164
165 // We process the format string using a binary state machine. The
166 // current state is stored in CurrentState.
167 enum {
168 state_OrdChr,
169 state_Conversion
170 } CurrentState = state_OrdChr;
171
172 // numConversions - The number of conversions seen so far. This is
173 // incremented as we traverse the format string.
174 unsigned numConversions = 0;
175
176 // numDataArgs - The number of data arguments after the format
177 // string. This can only be determined for non vprintf-like
178 // functions. For those functions, this value is 1 (the sole
179 // va_arg argument).
180 unsigned numDataArgs = NumArgsInCall-(format_idx+1);
181
182 // Inspect the format string.
183 unsigned StrIdx = 0;
184
185 // LastConversionIdx - Index within the format string where we last saw
186 // a '%' character that starts a new format conversion.
187 unsigned LastConversionIdx = 0;
188
189 for ( ; StrIdx < StrLen ; ++StrIdx ) {
190
191 // Is the number of detected conversion conversions greater than
192 // the number of matching data arguments? If so, stop.
193 if (!HasVAListArg && numConversions > numDataArgs) break;
194
195 // Handle "\0"
196 if(Str[StrIdx] == '\0' ) {
197 // The string returned by getStrData() is not null-terminated,
198 // so the presence of a null character is likely an error.
199
200 SourceLocation Loc =
201 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1);
202
203 Diag(Loc, diag::warn_printf_format_string_contains_null_char,
204 Fn->getSourceRange());
205
206 return;
207 }
208
209 // Ordinary characters (not processing a format conversion).
210 if (CurrentState == state_OrdChr) {
211 if (Str[StrIdx] == '%') {
212 CurrentState = state_Conversion;
213 LastConversionIdx = StrIdx;
214 }
215 continue;
216 }
217
218 // Seen '%'. Now processing a format conversion.
219 switch (Str[StrIdx]) {
220 // Characters which can terminate a format conversion
221 // (e.g. "%d"). Characters that specify length modifiers or
222 // other flags are handled by the default case below.
223 //
224 // TODO: additional checks will go into the following cases.
225 case 'i':
226 case 'd':
227 case 'o':
228 case 'u':
229 case 'x':
230 case 'X':
231 case 'D':
232 case 'O':
233 case 'U':
234 case 'e':
235 case 'E':
236 case 'f':
237 case 'F':
238 case 'g':
239 case 'G':
240 case 'a':
241 case 'A':
242 case 'c':
243 case 'C':
244 case 'S':
245 case 's':
246 case 'P':
247 ++numConversions;
248 CurrentState = state_OrdChr;
249 break;
250
251 // CHECK: Are we using "%n"? Issue a warning.
252 case 'n': {
253 ++numConversions;
254 CurrentState = state_OrdChr;
255 SourceLocation Loc =
256 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
257 LastConversionIdx+1);
258
259 Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange());
260 break;
261 }
262
263 // Handle "%%"
264 case '%':
265 // Sanity check: Was the first "%" character the previous one?
266 // If not, we will assume that we have a malformed format
267 // conversion, and that the current "%" character is the start
268 // of a new conversion.
269 if (StrIdx - LastConversionIdx == 1)
270 CurrentState = state_OrdChr;
271 else {
272 // Issue a warning: invalid format conversion.
273 SourceLocation Loc =
274 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
275 LastConversionIdx+1);
276
277 Diag(Loc, diag::warn_printf_invalid_conversion,
278 std::string(Str+LastConversionIdx, Str+StrIdx),
279 Fn->getSourceRange());
280
281 // This conversion is broken. Advance to the next format
282 // conversion.
283 LastConversionIdx = StrIdx;
284 ++numConversions;
285 }
286
287 break;
288
289 default:
290 // This case catches all other characters: flags, widths, etc.
291 // We should eventually process those as well.
292 break;
293 }
294 }
295
296 if (CurrentState == state_Conversion) {
297 // Issue a warning: invalid format conversion.
298 SourceLocation Loc =
299 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
300 LastConversionIdx+1);
301
302 Diag(Loc, diag::warn_printf_invalid_conversion,
303 std::string(Str+LastConversionIdx, Str+StrIdx),
304 Fn->getSourceRange());
305 return;
306 }
307
308 if (!HasVAListArg) {
309 // CHECK: Does the number of format conversions exceed the number
310 // of data arguments?
311 if (numConversions > numDataArgs) {
312 SourceLocation Loc =
313 PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
314 LastConversionIdx);
315
316 Diag(Loc, diag::warn_printf_insufficient_data_args,
317 Fn->getSourceRange());
318 }
319 // CHECK: Does the number of data arguments exceed the number of
320 // format conversions in the format string?
321 else if (numConversions < numDataArgs)
322 Diag(Args[format_idx+numConversions+1]->getLocStart(),
323 diag::warn_printf_too_many_data_args, Fn->getSourceRange());
324 }
325}