blob: 53e37cbcabebee1a1d827e9f0a4c5334b8e51675 [file] [log] [blame]
Janis Danisevskis112c9cc2016-03-31 13:35:25 +01001# This set of tests is for UTF-8 support and Unicode property support, with
2# relevance only for the 8-bit library.
3
Elliott Hughes2dbd7d22020-06-03 14:32:37 -07004# The next 5 patterns have UTF-8 errors
Janis Danisevskis112c9cc2016-03-31 13:35:25 +01005
6/[Ã]/utf
7
8/Ã/utf
9
10/ÃÃÃxxx/utf
11
12/‚‚‚‚‚‚‚Ã/utf
13
Elliott Hughes2dbd7d22020-06-03 14:32:37 -070014/‚‚‚‚‚‚‚Ã/match_invalid_utf
15
Janis Danisevskis112c9cc2016-03-31 13:35:25 +010016# Now test subjects
17
18/badutf/utf
19\= Expect UTF-8 errors
20 X\xdf
21 XX\xef
22 XXX\xef\x80
23 X\xf7
24 XX\xf7\x80
25 XXX\xf7\x80\x80
26 \xfb
27 \xfb\x80
28 \xfb\x80\x80
29 \xfb\x80\x80\x80
30 \xfd
31 \xfd\x80
32 \xfd\x80\x80
33 \xfd\x80\x80\x80
34 \xfd\x80\x80\x80\x80
35 \xdf\x7f
36 \xef\x7f\x80
37 \xef\x80\x7f
38 \xf7\x7f\x80\x80
39 \xf7\x80\x7f\x80
40 \xf7\x80\x80\x7f
41 \xfb\x7f\x80\x80\x80
42 \xfb\x80\x7f\x80\x80
43 \xfb\x80\x80\x7f\x80
44 \xfb\x80\x80\x80\x7f
45 \xfd\x7f\x80\x80\x80\x80
46 \xfd\x80\x7f\x80\x80\x80
47 \xfd\x80\x80\x7f\x80\x80
48 \xfd\x80\x80\x80\x7f\x80
49 \xfd\x80\x80\x80\x80\x7f
50 \xed\xa0\x80
51 \xc0\x8f
52 \xe0\x80\x8f
53 \xf0\x80\x80\x8f
54 \xf8\x80\x80\x80\x8f
55 \xfc\x80\x80\x80\x80\x8f
56 \x80
57 \xfe
58 \xff
59
60/badutf/utf
61\= Expect UTF-8 errors
62 XX\xfb\x80\x80\x80\x80
63 XX\xfd\x80\x80\x80\x80\x80
64 XX\xf7\xbf\xbf\xbf
65
66/shortutf/utf
67\= Expect UTF-8 errors
68 XX\xdf\=ph
69 XX\xef\=ph
70 XX\xef\x80\=ph
71 \xf7\=ph
72 \xf7\x80\=ph
73 \xf7\x80\x80\=ph
74 \xfb\=ph
75 \xfb\x80\=ph
76 \xfb\x80\x80\=ph
77 \xfb\x80\x80\x80\=ph
78 \xfd\=ph
79 \xfd\x80\=ph
80 \xfd\x80\x80\=ph
81 \xfd\x80\x80\x80\=ph
82 \xfd\x80\x80\x80\x80\=ph
83
84/anything/utf
85\= Expect UTF-8 errors
86 X\xc0\x80
87 XX\xc1\x8f
88 XXX\xe0\x9f\x80
89 \xf0\x8f\x80\x80
90 \xf8\x87\x80\x80\x80
91 \xfc\x83\x80\x80\x80\x80
92 \xfe\x80\x80\x80\x80\x80
93 \xff\x80\x80\x80\x80\x80
94 \xf8\x88\x80\x80\x80
95 \xf9\x87\x80\x80\x80
96 \xfc\x84\x80\x80\x80\x80
97 \xfd\x83\x80\x80\x80\x80
98\= Expect no match
99 \xc3\x8f
100 \xe0\xaf\x80
101 \xe1\x80\x80
102 \xf0\x9f\x80\x80
103 \xf1\x8f\x80\x80
104 \xf8\x88\x80\x80\x80\=no_utf_check
105 \xf9\x87\x80\x80\x80\=no_utf_check
106 \xfc\x84\x80\x80\x80\x80\=no_utf_check
107 \xfd\x83\x80\x80\x80\x80\=no_utf_check
108
109# Similar tests with offsets
110
111/badutf/utf
112\= Expect UTF-8 errors
113 X\xdfabcd
114 X\xdfabcd\=offset=1
115\= Expect no match
116 X\xdfabcd\=offset=2
117
118/(?<=x)badutf/utf
119\= Expect UTF-8 errors
120 X\xdfabcd
121 X\xdfabcd\=offset=1
122 X\xdfabcd\=offset=2
123 X\xdfabcd\xdf\=offset=3
124\= Expect no match
125 X\xdfabcd\=offset=3
126
127/(?<=xx)badutf/utf
128\= Expect UTF-8 errors
129 X\xdfabcd
130 X\xdfabcd\=offset=1
131 X\xdfabcd\=offset=2
132 X\xdfabcd\=offset=3
133
134/(?<=xxxx)badutf/utf
135\= Expect UTF-8 errors
136 X\xdfabcd
137 X\xdfabcd\=offset=1
138 X\xdfabcd\=offset=2
139 X\xdfabcd\=offset=3
140 X\xdfabc\xdf\=offset=6
141 X\xdfabc\xdf\=offset=7
142\= Expect no match
143 X\xdfabcd\=offset=6
144
145/\x{100}/IB,utf
146
147/\x{1000}/IB,utf
148
149/\x{10000}/IB,utf
150
151/\x{100000}/IB,utf
152
153/\x{10ffff}/IB,utf
154
155/[\x{ff}]/IB,utf
156
157/[\x{100}]/IB,utf
158
159/\x80/IB,utf
160
161/\xff/IB,utf
162
163/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
164 \x{D55c}\x{ad6d}\x{C5B4}
165
166/\x{65e5}\x{672c}\x{8a9e}/IB,utf
167 \x{65e5}\x{672c}\x{8a9e}
168
169/\x{80}/IB,utf
170
171/\x{084}/IB,utf
172
173/\x{104}/IB,utf
174
175/\x{861}/IB,utf
176
177/\x{212ab}/IB,utf
178
179/[^ab\xC0-\xF0]/IB,utf
180 \x{f1}
181 \x{bf}
182 \x{100}
183 \x{1000}
184\= Expect no match
185 \x{c0}
186 \x{f0}
187
188/Ā{3,4}/IB,utf
189 \x{100}\x{100}\x{100}\x{100\x{100}
190
191/(\x{100}+|x)/IB,utf
192
193/(\x{100}*a|x)/IB,utf
194
195/(\x{100}{0,2}a|x)/IB,utf
196
197/(\x{100}{1,2}a|x)/IB,utf
198
199/\x{100}/IB,utf
200
201/a\x{100}\x{101}*/IB,utf
202
203/a\x{100}\x{101}+/IB,utf
204
205/[^\x{c4}]/IB
206
207/[\x{100}]/IB,utf
208 \x{100}
209 Z\x{100}
210 \x{100}Z
211
212/[\xff]/IB,utf
213 >\x{ff}<
214
215/[^\xff]/IB,utf
216
217/\x{100}abc(xyz(?1))/IB,utf
218
219/\777/I,utf
220 \x{1ff}
221 \777
222
223/\x{100}+\x{200}/IB,utf
224
225/\x{100}+X/IB,utf
226
227/^[\QĀ\E-\QŐ\E/B,utf
228
229# This tests the stricter UTF-8 check according to RFC 3629.
230
231/X/utf
232\= Expect UTF-8 errors
233 \x{d800}
234 \x{da00}
235 \x{dfff}
236 \x{110000}
237 \x{2000000}
238 \x{7fffffff}
239\= Expect no match
240 \x{d800}\=no_utf_check
241 \x{da00}\=no_utf_check
242 \x{dfff}\=no_utf_check
243 \x{110000}\=no_utf_check
244 \x{2000000}\=no_utf_check
245 \x{7fffffff}\=no_utf_check
246
247/(*UTF8)\x{1234}/
248 abcd\x{1234}pqr
249
250/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I
251
252/\h/I,utf
253 ABC\x{09}
254 ABC\x{20}
255 ABC\x{a0}
256 ABC\x{1680}
257 ABC\x{180e}
258 ABC\x{2000}
259 ABC\x{202f}
260 ABC\x{205f}
261 ABC\x{3000}
262
263/\v/I,utf
264 ABC\x{0a}
265 ABC\x{0b}
266 ABC\x{0c}
267 ABC\x{0d}
268 ABC\x{85}
269 ABC\x{2028}
270
271/\h*A/I,utf
272 CDBABC
273
274/\v+A/I,utf
275
276/\s?xxx\s/I,utf
277
278/\sxxx\s/I,utf,tables=2
279 AB\x{85}xxx\x{a0}XYZ
280 AB\x{a0}xxx\x{85}XYZ
281
282/\S \S/I,utf,tables=2
283 \x{a2} \x{84}
284 A Z
285
286/a+/utf
287 a\x{123}aa\=offset=1
288 a\x{123}aa\=offset=3
289 a\x{123}aa\=offset=4
290\= Expect bad offset value
291 a\x{123}aa\=offset=6
292\= Expect bad UTF-8 offset
293 a\x{123}aa\=offset=2
294\= Expect no match
295 a\x{123}aa\=offset=5
296
297/\x{1234}+/Ii,utf
298
299/\x{1234}+?/Ii,utf
300
301/\x{1234}++/Ii,utf
302
303/\x{1234}{2}/Ii,utf
304
305/[^\x{c4}]/IB,utf
306
307/X+\x{200}/IB,utf
308
309/\R/I,utf
310
311/\777/IB,utf
312
313/\w+\x{C4}/B,utf
314 a\x{C4}\x{C4}
315
316/\w+\x{C4}/B,utf,tables=2
317 a\x{C4}\x{C4}
318
319/\W+\x{C4}/B,utf
320 !\x{C4}
321
322/\W+\x{C4}/B,utf,tables=2
323 !\x{C4}
324
325/\W+\x{A1}/B,utf
326 !\x{A1}
327
328/\W+\x{A1}/B,utf,tables=2
329 !\x{A1}
330
331/X\s+\x{A0}/B,utf
332 X\x20\x{A0}\x{A0}
333
334/X\s+\x{A0}/B,utf,tables=2
335 X\x20\x{A0}\x{A0}
336
337/\S+\x{A0}/B,utf
338 X\x{A0}\x{A0}
339
340/\S+\x{A0}/B,utf,tables=2
341 X\x{A0}\x{A0}
342
343/\x{a0}+\s!/B,utf
344 \x{a0}\x20!
345
346/\x{a0}+\s!/B,utf,tables=2
347 \x{a0}\x20!
348
349/A/utf
350 \x{ff000041}
351 \x{7f000041}
352
353/(*UTF8)abc/never_utf
354
355/abc/utf,never_utf
356
357/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
358
359/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
360
361/AB\x{1fb0}/IB,utf
362
363/AB\x{1fb0}/IBi,utf
364
365/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
366 \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
367 \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
368
369/[â±¥]/Bi,utf
370
371/[^â±¥]/Bi,utf
372
373/\h/I
374
375/\v/I
376
377/\R/I
378
379/[[:blank:]]/B,ucp
380
381/\x{212a}+/Ii,utf
382 KKkk\x{212a}
383
384/s+/Ii,utf
385 SSss\x{17f}
386
387/\x{100}*A/IB,utf
388 A
389
390/\x{100}*\d(?R)/IB,utf
391
392/[Z\x{100}]/IB,utf
393 Z\x{100}
394 \x{100}
395 \x{100}Z
396
397/[z-\x{100}]/IB,utf
398
399/[z\Qa-d]Ā\E]/IB,utf
400 \x{100}
401 Ā
402
403/[ab\x{100}]abc(xyz(?1))/IB,utf
404
405/\x{100}*\s/IB,utf
406
407/\x{100}*\d/IB,utf
408
409/\x{100}*\w/IB,utf
410
411/\x{100}*\D/IB,utf
412
413/\x{100}*\S/IB,utf
414
415/\x{100}*\W/IB,utf
416
417/[\x{105}-\x{109}]/IBi,utf
418 \x{104}
419 \x{105}
420 \x{109}
421\= Expect no match
422 \x{100}
423 \x{10a}
424
425/[z-\x{100}]/IBi,utf
426 Z
427 z
428 \x{39c}
429 \x{178}
430 |
431 \x{80}
432 \x{ff}
433 \x{100}
434 \x{101}
435\= Expect no match
436 \x{102}
437 Y
438 y
439
440/[z-\x{100}]/IBi,utf
441
442/\x{3a3}B/IBi,utf
443
444/abc/utf,replace
445 abc
446
447/(?<=(a)(?-1))x/I,utf
448 a\x80zx\=offset=3
449
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700450/[\W\p{Any}]/B
451 abc
452 123
453
454/[\W\pL]/B
455 abc
456\= Expect no match
457 123
458
459/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':Æ¿)/utf
460
461/[\s[:^ascii:]]/B,ucp
462
463# A special extra option allows escaped surrogate code points in 8-bit mode,
464# but subjects containing them must not be UTF-checked.
465
466/\x{d800}/I,utf,allow_surrogate_escapes
467 \x{d800}\=no_utf_check
468
469/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
470 \x{dfff}\x{df01}\=no_utf_check
471
472# This has different starting code units in 8-bit mode.
473
474/^[^ab]/IB,utf
475 c
476 \x{ff}
477 \x{100}
478\= Expect no match
479 aaa
Elliott Hughes0c26e192019-08-07 12:24:46 -0700480
481# Offsets are different in 8-bit mode.
482
483/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
484 123abcáyzabcdef789abcሴqr
485
486# Check name length with non-ASCII characters
487
488/(?'ABáC678901234567890123456789012'...)/utf
489
490/(?'ABáC6789012345678901234567890123'...)/utf
491
492/(?'ABZC6789012345678901234567890123'...)/utf
493
494/(?(n/utf
495
496/(?(á/utf
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700497
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700498# Invalid UTF-8 tests
499
500/.../g,match_invalid_utf
501 abcd\x80wxzy\x80pqrs
502 abcd\x{80}wxzy\x80pqrs
503
504/abc/match_invalid_utf
505 ab\x80ab\=ph
506\= Expect no match
507 ab\x80cdef\=ph
508
509/ab$/match_invalid_utf
510 ab\x80cdeab
511\= Expect no match
512 ab\x80cde
513
514/.../g,match_invalid_utf
515 abcd\x{80}wxzy\x80pqrs
516
517/(?<=x)../g,match_invalid_utf
518 abcd\x{80}wxzy\x80pqrs
519 abcd\x{80}wxzy\x80xpqrs
520
521/X$/match_invalid_utf
522\= Expect no match
523 X\xc4
524
525/(?<=..)X/match_invalid_utf,aftertext
526 AB\x80AQXYZ
527 AB\x80AQXYZ\=offset=5
528 AB\x80\x80AXYZXC\=offset=5
529\= Expect no match
530 AB\x80XYZ
531 AB\x80XYZ\=offset=3
532 AB\xfeXYZ
533 AB\xffXYZ\=offset=3
534 AB\x80AXYZ
535 AB\x80AXYZ\=offset=4
536 AB\x80\x80AXYZ\=offset=5
537
538/.../match_invalid_utf
539 AB\xc4CCC
540\= Expect no match
541 A\x{d800}B
542 A\x{110000}B
543 A\xc4B
544
545/\bX/match_invalid_utf
546 A\x80X
547
548/\BX/match_invalid_utf
549\= Expect no match
550 A\x80X
551
552/(?<=...)X/match_invalid_utf
553 AAA\x80BBBXYZ
554\= Expect no match
555 AAA\x80BXYZ
556 AAA\x80BBXYZ
557
558# -------------------------------------
559
560/(*UTF)(?=\x{123})/I
561
562/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
563
564/[󿾟,]/BI,utf
565
566/[\x{fff4}-\x{ffff8}]/I,utf
567
568/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
569
570/[\xff\x{ffff}]/I,utf
571
572/[\xff\x{ff}]/I,utf
573 abc\x{ff}def
574
575/[\xff\x{ff}]/I
576 abc\x{ff}def
577
578/[Ss]/I
579
580/[Ss]/I,utf
581
582/(?:\x{ff}|\x{3000})/I,utf
583
584/x/utf
585 abxyz
586 \x80\=startchar
587 abc\x80\=startchar
588 abc\x80\=startchar,offset=3
589
590/\x{c1}+\x{e1}/iIB,ucp
591 \x{c1}\x{c1}\x{c1}
592 \x{e1}\x{e1}\x{e1}
593
594/a|\x{c1}/iI,ucp
595 \x{e1}xxx
596
597/a|\x{c1}/iI,utf
598 \x{e1}xxx
599
600/\x{c1}|\x{e1}/iI,ucp
601
602/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
603 X\x{e1}Y
604
605/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
606 X\x{c1}Y
607
608# Without UTF or UCP, characters > 127 have only one case in the default locale.
609
610/X(\x{e1})Y/replace=>\U$1<,substitute_extended
611 X\x{e1}Y
612
Elliott Hughes3435c422020-12-04 13:18:28 -0800613/A/utf,match_invalid_utf,caseless
614 \xe5A
615
616/\bch\b/utf,match_invalid_utf
617 qchq\=ph
618 qchq\=ps
619
Janis Danisevskis112c9cc2016-03-31 13:35:25 +0100620# End of testinput10