blob: 9bb5073a8b21bc1d6a3a397653867de5b748709a [file] [log] [blame]
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "src/regexp/regexp-macro-assembler.h"
6
7#include "src/assembler.h"
8#include "src/isolate-inl.h"
9#include "src/regexp/regexp-stack.h"
10#include "src/simulator.h"
11
Ben Murdoch097c5b22016-05-18 11:27:45 +010012#ifdef V8_I18N_SUPPORT
13#include "unicode/uchar.h"
14#endif // V8_I18N_SUPPORT
15
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000016namespace v8 {
17namespace internal {
18
19RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20 : slow_safe_compiler_(false),
21 global_mode_(NOT_GLOBAL),
22 isolate_(isolate),
23 zone_(zone) {}
24
25
26RegExpMacroAssembler::~RegExpMacroAssembler() {
27}
28
29
Ben Murdoch097c5b22016-05-18 11:27:45 +010030int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31 Address byte_offset2,
32 size_t byte_length,
33 Isolate* isolate) {
34 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35 isolate->regexp_macro_assembler_canonicalize();
36 // This function is not allowed to cause a garbage collection.
37 // A GC might move the calling generated code and invalidate the
38 // return address on the stack.
39 DCHECK(byte_length % 2 == 0);
40 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42 size_t length = byte_length >> 1;
43
44#ifdef V8_I18N_SUPPORT
45 if (isolate == nullptr) {
46 for (size_t i = 0; i < length; i++) {
47 uc32 c1 = substring1[i];
48 uc32 c2 = substring2[i];
49 if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50 // Non-BMP characters do not have case-equivalents in the BMP.
51 // Both have to be non-BMP for them to be able to match.
52 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53 if (i + 1 < length) {
54 uc16 c1t = substring1[i + 1];
55 uc16 c2t = substring2[i + 1];
56 if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57 unibrow::Utf16::IsTrailSurrogate(c2t)) {
58 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60 i++;
61 }
62 }
63 }
64 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66 if (c1 != c2) return 0;
67 }
68 return 1;
69 }
70#endif // V8_I18N_SUPPORT
71 DCHECK_NOT_NULL(isolate);
72 for (size_t i = 0; i < length; i++) {
73 unibrow::uchar c1 = substring1[i];
74 unibrow::uchar c2 = substring2[i];
75 if (c1 != c2) {
76 unibrow::uchar s1[1] = {c1};
77 canonicalize->get(c1, '\0', s1);
78 if (s1[0] != c2) {
79 unibrow::uchar s2[1] = {c2};
80 canonicalize->get(c2, '\0', s2);
81 if (s1[0] != s2[0]) {
82 return 0;
83 }
84 }
85 }
86 }
87 return 1;
88}
89
90
91void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92 Label* on_failure) {
93 Label ok;
94 // Check that current character is not a trail surrogate.
95 LoadCurrentCharacter(cp_offset, &ok);
96 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97 // Check that previous character is not a lead surrogate.
98 LoadCurrentCharacter(cp_offset - 1, &ok);
99 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100 Bind(&ok);
101}
102
103
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000104#ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
105
106NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
107 Zone* zone)
108 : RegExpMacroAssembler(isolate, zone) {}
109
110
111NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
112}
113
114
115bool NativeRegExpMacroAssembler::CanReadUnaligned() {
116 return FLAG_enable_unaligned_accesses && !slow_safe();
117}
118
119const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
120 String* subject,
121 int start_index) {
122 if (subject->IsConsString()) {
123 subject = ConsString::cast(subject)->first();
124 } else if (subject->IsSlicedString()) {
125 start_index += SlicedString::cast(subject)->offset();
126 subject = SlicedString::cast(subject)->parent();
127 }
128 DCHECK(start_index >= 0);
129 DCHECK(start_index <= subject->length());
130 if (subject->IsSeqOneByteString()) {
131 return reinterpret_cast<const byte*>(
132 SeqOneByteString::cast(subject)->GetChars() + start_index);
133 } else if (subject->IsSeqTwoByteString()) {
134 return reinterpret_cast<const byte*>(
135 SeqTwoByteString::cast(subject)->GetChars() + start_index);
136 } else if (subject->IsExternalOneByteString()) {
137 return reinterpret_cast<const byte*>(
138 ExternalOneByteString::cast(subject)->GetChars() + start_index);
139 } else {
140 return reinterpret_cast<const byte*>(
141 ExternalTwoByteString::cast(subject)->GetChars() + start_index);
142 }
143}
144
145
146int NativeRegExpMacroAssembler::CheckStackGuardState(
147 Isolate* isolate, int start_index, bool is_direct_call,
148 Address* return_address, Code* re_code, String** subject,
149 const byte** input_start, const byte** input_end) {
150 DCHECK(re_code->instruction_start() <= *return_address);
151 DCHECK(*return_address <= re_code->instruction_end());
152 int return_value = 0;
153 // Prepare for possible GC.
154 HandleScope handles(isolate);
155 Handle<Code> code_handle(re_code);
156 Handle<String> subject_handle(*subject);
157 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
158
159 StackLimitCheck check(isolate);
160 if (check.JsHasOverflowed()) {
161 isolate->StackOverflow();
162 return_value = EXCEPTION;
163 } else if (is_direct_call) {
164 // If not real stack overflow the stack guard was used to interrupt
165 // execution for another purpose. If this is a direct call from JavaScript
166 // retry the RegExp forcing the call through the runtime system.
167 // Currently the direct call cannot handle a GC.
168 return_value = RETRY;
169 } else {
170 Object* result = isolate->stack_guard()->HandleInterrupts();
171 if (result->IsException()) return_value = EXCEPTION;
172 }
173
174 DisallowHeapAllocation no_gc;
175
176 if (*code_handle != re_code) { // Return address no longer valid
177 intptr_t delta = code_handle->address() - re_code->address();
178 // Overwrite the return address on the stack.
179 *return_address += delta;
180 }
181
182 // If we continue, we need to update the subject string addresses.
183 if (return_value == 0) {
184 // String encoding might have changed.
185 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
186 // If we changed between an LATIN1 and an UC16 string, the specialized
187 // code cannot be used, and we need to restart regexp matching from
188 // scratch (including, potentially, compiling a new version of the code).
189 return_value = RETRY;
190 } else {
191 *subject = *subject_handle;
192 intptr_t byte_length = *input_end - *input_start;
193 *input_start = StringCharacterPosition(*subject, start_index);
194 *input_end = *input_start + byte_length;
195 }
196 }
197 return return_value;
198}
199
200
201NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
202 Handle<Code> regexp_code,
203 Handle<String> subject,
204 int* offsets_vector,
205 int offsets_vector_length,
206 int previous_index,
207 Isolate* isolate) {
208
209 DCHECK(subject->IsFlat());
210 DCHECK(previous_index >= 0);
211 DCHECK(previous_index <= subject->length());
212
213 // No allocations before calling the regexp, but we can't use
214 // DisallowHeapAllocation, since regexps might be preempted, and another
215 // thread might do allocation anyway.
216
217 String* subject_ptr = *subject;
218 // Character offsets into string.
219 int start_offset = previous_index;
220 int char_length = subject_ptr->length() - start_offset;
221 int slice_offset = 0;
222
223 // The string has been flattened, so if it is a cons string it contains the
224 // full string in the first part.
225 if (StringShape(subject_ptr).IsCons()) {
226 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
227 subject_ptr = ConsString::cast(subject_ptr)->first();
228 } else if (StringShape(subject_ptr).IsSliced()) {
229 SlicedString* slice = SlicedString::cast(subject_ptr);
230 subject_ptr = slice->parent();
231 slice_offset = slice->offset();
232 }
233 // Ensure that an underlying string has the same representation.
234 bool is_one_byte = subject_ptr->IsOneByteRepresentation();
235 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
236 // String is now either Sequential or External
237 int char_size_shift = is_one_byte ? 0 : 1;
238
239 const byte* input_start =
240 StringCharacterPosition(subject_ptr, start_offset + slice_offset);
241 int byte_length = char_length << char_size_shift;
242 const byte* input_end = input_start + byte_length;
243 Result res = Execute(*regexp_code,
244 *subject,
245 start_offset,
246 input_start,
247 input_end,
248 offsets_vector,
249 offsets_vector_length,
250 isolate);
251 return res;
252}
253
254
255NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
256 Code* code,
257 String* input, // This needs to be the unpacked (sliced, cons) string.
258 int start_offset,
259 const byte* input_start,
260 const byte* input_end,
261 int* output,
262 int output_size,
263 Isolate* isolate) {
264 // Ensure that the minimum stack has been allocated.
265 RegExpStackScope stack_scope(isolate);
266 Address stack_base = stack_scope.stack()->stack_base();
267
268 int direct_call = 0;
269 int result = CALL_GENERATED_REGEXP_CODE(
270 isolate, code->entry(), input, start_offset, input_start, input_end,
271 output, output_size, stack_base, direct_call, isolate);
272 DCHECK(result >= RETRY);
273
274 if (result == EXCEPTION && !isolate->has_pending_exception()) {
275 // We detected a stack overflow (on the backtrack stack) in RegExp code,
276 // but haven't created the exception yet.
277 isolate->StackOverflow();
278 }
279 return static_cast<Result>(result);
280}
281
282
283const byte NativeRegExpMacroAssembler::word_character_map[] = {
284 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
285 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
286 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
287 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
288
289 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
290 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
291 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
292 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
293
294 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G'
295 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
296 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
297 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
298
299 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g'
300 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
301 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
302 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
303 // Latin-1 range
304 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
305 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
306 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
308
309 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
310 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
311 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
312 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
313
314 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
315 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
316 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
317 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
318
319 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
320 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323};
324
325
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000326Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
327 Address* stack_base,
328 Isolate* isolate) {
329 RegExpStack* regexp_stack = isolate->regexp_stack();
330 size_t size = regexp_stack->stack_capacity();
331 Address old_stack_base = regexp_stack->stack_base();
332 DCHECK(old_stack_base == *stack_base);
333 DCHECK(stack_pointer <= old_stack_base);
334 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
335 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
336 if (new_stack_base == NULL) {
337 return NULL;
338 }
339 *stack_base = new_stack_base;
340 intptr_t stack_content_size = old_stack_base - stack_pointer;
341 return new_stack_base - stack_content_size;
342}
343
344#endif // V8_INTERPRETED_REGEXP
345
346} // namespace internal
347} // namespace v8