Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | mat!(uni_literal, r"☃", "☃", Some((0, 3))); |
| 2 | mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3))); |
| 3 | mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3))); |
| 4 | mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3))); |
| 5 | mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); |
| 6 | mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); |
| 7 | mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))); |
| 8 | mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); |
| 9 | mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); |
| 10 | mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); |
| 11 | mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))); |
| 12 | mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); |
| 13 | mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); |
| 14 | mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); |
| 15 | |
| 16 | // Test the Unicode friendliness of Perl character classes. |
| 17 | mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); |
| 18 | mat!(uni_perl_w_not, r"\w+", "⥡", None); |
| 19 | mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); |
| 20 | mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); |
| 21 | mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); |
| 22 | mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); |
| 23 | mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); |
| 24 | mat!(uni_perl_s_not, r"\s+", "☃", None); |
| 25 | mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); |
| 26 | |
| 27 | // And do the same for word boundaries. |
| 28 | mat!(uni_boundary_none, r"\d\b", "6δ", None); |
| 29 | mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); |
| 30 | mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1))); |
| 31 | mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); |
| 32 | |
| 33 | // Test general categories. |
| 34 | // |
| 35 | // We should test more, but there's a lot. Write a script to generate more of |
| 36 | // these tests. |
| 37 | mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); |
| 38 | mat!( |
| 39 | uni_class_gencat_close_punctuation, |
| 40 | r"\p{Close_Punctuation}", |
| 41 | "❯", |
| 42 | Some((0, 3)) |
| 43 | ); |
| 44 | mat!( |
| 45 | uni_class_gencat_connector_punctuation, |
| 46 | r"\p{Connector_Punctuation}", |
| 47 | "⁀", |
| 48 | Some((0, 3)) |
| 49 | ); |
| 50 | mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2))); |
| 51 | mat!( |
| 52 | uni_class_gencat_currency_symbol, |
| 53 | r"\p{Currency_Symbol}", |
| 54 | "£", |
| 55 | Some((0, 3)) |
| 56 | ); |
| 57 | mat!( |
| 58 | uni_class_gencat_dash_punctuation, |
| 59 | r"\p{Dash_Punctuation}", |
| 60 | "〰", |
| 61 | Some((0, 3)) |
| 62 | ); |
| 63 | mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "𑓙", Some((0, 4))); |
| 64 | mat!( |
| 65 | uni_class_gencat_enclosing_mark, |
| 66 | r"\p{Enclosing_Mark}", |
| 67 | "\u{A672}", |
| 68 | Some((0, 3)) |
| 69 | ); |
| 70 | mat!( |
| 71 | uni_class_gencat_final_punctuation, |
| 72 | r"\p{Final_Punctuation}", |
| 73 | "⸡", |
| 74 | Some((0, 3)) |
| 75 | ); |
| 76 | mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 77 | // See: https://github.com/rust-lang/regex/issues/719 |
| 78 | mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); |
| 79 | mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 80 | mat!( |
| 81 | uni_class_gencat_initial_punctuation, |
| 82 | r"\p{Initial_Punctuation}", |
| 83 | "⸜", |
| 84 | Some((0, 3)) |
| 85 | ); |
| 86 | mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2))); |
| 87 | mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3))); |
| 88 | mat!( |
| 89 | uni_class_gencat_line_separator, |
| 90 | r"\p{Line_Separator}", |
| 91 | "\u{2028}", |
| 92 | Some((0, 3)) |
| 93 | ); |
| 94 | mat!( |
| 95 | uni_class_gencat_lowercase_letter, |
| 96 | r"\p{Lowercase_Letter}", |
| 97 | "ϛ", |
| 98 | Some((0, 2)) |
| 99 | ); |
| 100 | mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4))); |
| 101 | mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3))); |
| 102 | mat!( |
| 103 | uni_class_gencat_modifier_letter, |
| 104 | r"\p{Modifier_Letter}", |
| 105 | "𖭃", |
| 106 | Some((0, 4)) |
| 107 | ); |
| 108 | mat!( |
| 109 | uni_class_gencat_modifier_symbol, |
| 110 | r"\p{Modifier_Symbol}", |
| 111 | "🏿", |
| 112 | Some((0, 4)) |
| 113 | ); |
| 114 | mat!( |
| 115 | uni_class_gencat_nonspacing_mark, |
| 116 | r"\p{Nonspacing_Mark}", |
| 117 | "\u{1E94A}", |
| 118 | Some((0, 4)) |
| 119 | ); |
| 120 | mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3))); |
| 121 | mat!( |
| 122 | uni_class_gencat_open_punctuation, |
| 123 | r"\p{Open_Punctuation}", |
| 124 | "⦅", |
| 125 | Some((0, 3)) |
| 126 | ); |
| 127 | mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3))); |
| 128 | mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3))); |
| 129 | mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3))); |
| 130 | mat!( |
| 131 | uni_class_gencat_other_punctuation, |
| 132 | r"\p{Other_Punctuation}", |
| 133 | "𞥞", |
| 134 | Some((0, 4)) |
| 135 | ); |
| 136 | mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3))); |
| 137 | mat!( |
| 138 | uni_class_gencat_paragraph_separator, |
| 139 | r"\p{Paragraph_Separator}", |
| 140 | "\u{2029}", |
| 141 | Some((0, 3)) |
| 142 | ); |
| 143 | mat!( |
| 144 | uni_class_gencat_private_use, |
| 145 | r"\p{Private_Use}", |
| 146 | "\u{10FFFD}", |
| 147 | Some((0, 4)) |
| 148 | ); |
| 149 | mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "𑁍", Some((0, 4))); |
| 150 | mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3))); |
| 151 | mat!( |
| 152 | uni_class_gencat_space_separator, |
| 153 | r"\p{Space_Separator}", |
| 154 | "\u{205F}", |
| 155 | Some((0, 3)) |
| 156 | ); |
| 157 | mat!( |
| 158 | uni_class_gencat_spacing_mark, |
| 159 | r"\p{Spacing_Mark}", |
| 160 | "\u{16F7E}", |
| 161 | Some((0, 4)) |
| 162 | ); |
| 163 | mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3))); |
| 164 | mat!( |
| 165 | uni_class_gencat_titlecase_letter, |
| 166 | r"\p{Titlecase_Letter}", |
| 167 | "ῼ", |
| 168 | Some((0, 3)) |
| 169 | ); |
| 170 | mat!( |
| 171 | uni_class_gencat_unassigned, |
| 172 | r"\p{Unassigned}", |
| 173 | "\u{10FFFF}", |
| 174 | Some((0, 4)) |
| 175 | ); |
| 176 | mat!( |
| 177 | uni_class_gencat_uppercase_letter, |
| 178 | r"\p{Uppercase_Letter}", |
| 179 | "Ꝋ", |
| 180 | Some((0, 3)) |
| 181 | ); |
| 182 | |
| 183 | // Test a smattering of properties. |
| 184 | mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3))); |
| 185 | mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4))); |
| 186 | mat!( |
| 187 | uni_class_prop_picto1, |
| 188 | r"\p{extendedpictographic}", |
| 189 | "\u{1FA6E}", |
| 190 | Some((0, 4)) |
| 191 | ); |
| 192 | mat!( |
| 193 | uni_class_prop_picto2, |
| 194 | r"\p{extendedpictographic}", |
| 195 | "\u{1FFFD}", |
| 196 | Some((0, 4)) |
| 197 | ); |
| 198 | |
| 199 | // grapheme_cluster_break |
| 200 | mat!( |
| 201 | uni_class_gcb_prepend, |
| 202 | r"\p{grapheme_cluster_break=prepend}", |
| 203 | "\u{11D46}", |
| 204 | Some((0, 4)) |
| 205 | ); |
| 206 | mat!( |
| 207 | uni_class_gcb_ri1, |
| 208 | r"\p{gcb=regional_indicator}", |
| 209 | "\u{1F1E6}", |
| 210 | Some((0, 4)) |
| 211 | ); |
| 212 | mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4))); |
| 213 | mat!( |
| 214 | uni_class_gcb_ri3, |
| 215 | r"\p{gcb=regionalindicator}", |
| 216 | "\u{1F1FF}", |
| 217 | Some((0, 4)) |
| 218 | ); |
| 219 | mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3))); |
| 220 | mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3))); |
| 221 | |
| 222 | // word_break |
| 223 | mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3))); |
| 224 | mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3))); |
| 225 | mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3))); |
| 226 | mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3))); |
| 227 | mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4))); |
| 228 | |
| 229 | // sentence_break |
| 230 | mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2))); |
| 231 | mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2))); |
| 232 | mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3))); |
| 233 | mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4))); |
| 234 | mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3))); |