| /* |
| * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. Oracle designates this |
| * particular file as subject to the "Classpath" exception as provided |
| * by Oracle in the LICENSE file that accompanied this code. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| */ |
| |
| package java.util.regex; |
| |
| import java.util.HashMap; |
| import java.util.Locale; |
| import java.util.regex.Pattern.CharPredicate; |
| import java.util.regex.Pattern.BmpCharPredicate; |
| |
| class CharPredicates { |
| |
| static final CharPredicate ALPHABETIC() { |
| return Character::isAlphabetic; |
| } |
| |
| // \p{gc=Decimal_Number} |
| static final CharPredicate DIGIT() { |
| return Character::isDigit; |
| } |
| |
| static final CharPredicate LETTER() { |
| return Character::isLetter; |
| } |
| |
| static final CharPredicate IDEOGRAPHIC() { |
| return Character::isIdeographic; |
| } |
| |
| static final CharPredicate LOWERCASE() { |
| return Character::isLowerCase; |
| } |
| |
| static final CharPredicate UPPERCASE() { |
| return Character::isUpperCase; |
| } |
| |
| static final CharPredicate TITLECASE() { |
| return Character::isTitleCase; |
| } |
| |
| // \p{Whitespace} |
| static final CharPredicate WHITE_SPACE() { |
| return ch -> |
| ((((1 << Character.SPACE_SEPARATOR) | |
| (1 << Character.LINE_SEPARATOR) | |
| (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) |
| != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); |
| } |
| |
| // \p{gc=Control} |
| static final CharPredicate CONTROL() { |
| return ch -> Character.getType(ch) == Character.CONTROL; |
| } |
| |
| // \p{gc=Punctuation} |
| static final CharPredicate PUNCTUATION() { |
| return ch -> |
| ((((1 << Character.CONNECTOR_PUNCTUATION) | |
| (1 << Character.DASH_PUNCTUATION) | |
| (1 << Character.START_PUNCTUATION) | |
| (1 << Character.END_PUNCTUATION) | |
| (1 << Character.OTHER_PUNCTUATION) | |
| (1 << Character.INITIAL_QUOTE_PUNCTUATION) | |
| (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) |
| != 0; |
| } |
| |
| // \p{gc=Decimal_Number} |
| // \p{Hex_Digit} -> PropList.txt: Hex_Digit |
| static final CharPredicate HEX_DIGIT() { |
| return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) || |
| (ch >= 0x0041 && ch <= 0x0046) || |
| (ch >= 0x0061 && ch <= 0x0066) || |
| (ch >= 0xFF10 && ch <= 0xFF19) || |
| (ch >= 0xFF21 && ch <= 0xFF26) || |
| (ch >= 0xFF41 && ch <= 0xFF46)); |
| } |
| |
| static final CharPredicate ASSIGNED() { |
| return ch -> Character.getType(ch) != Character.UNASSIGNED; |
| } |
| |
| // PropList.txt:Noncharacter_Code_Point |
| static final CharPredicate NONCHARACTER_CODE_POINT() { |
| return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); |
| } |
| |
| // \p{alpha} |
| // \p{digit} |
| static final CharPredicate ALNUM() { |
| return ALPHABETIC().union(DIGIT()); |
| } |
| |
| // \p{Whitespace} -- |
| // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 |
| // \p{gc=Line_Separator} |
| // \p{gc=Paragraph_Separator}] |
| static final CharPredicate BLANK() { |
| return ch -> |
| Character.getType(ch) == Character.SPACE_SEPARATOR || |
| ch == 0x9; // \N{HT} |
| } |
| |
| // [^ |
| // \p{space} |
| // \p{gc=Control} |
| // \p{gc=Surrogate} |
| // \p{gc=Unassigned}] |
| static final CharPredicate GRAPH() { |
| return ch -> |
| ((((1 << Character.SPACE_SEPARATOR) | |
| (1 << Character.LINE_SEPARATOR) | |
| (1 << Character.PARAGRAPH_SEPARATOR) | |
| (1 << Character.CONTROL) | |
| (1 << Character.SURROGATE) | |
| (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) |
| == 0; |
| } |
| |
| // \p{graph} |
| // \p{blank} |
| // -- \p{cntrl} |
| static final CharPredicate PRINT() { |
| return GRAPH().union(BLANK()).and(CONTROL().negate()); |
| } |
| |
| // 200C..200D PropList.txt:Join_Control |
| static final CharPredicate JOIN_CONTROL() { |
| return ch -> ch == 0x200C || ch == 0x200D; |
| } |
| |
| // \p{alpha} |
| // \p{gc=Mark} |
| // \p{digit} |
| // \p{gc=Connector_Punctuation} |
| // \p{Join_Control} 200C..200D |
| static final CharPredicate WORD() { |
| return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) | |
| (1 << Character.ENCLOSING_MARK) | |
| (1 << Character.COMBINING_SPACING_MARK) | |
| (1 << Character.DECIMAL_DIGIT_NUMBER) | |
| (1 << Character.CONNECTOR_PUNCTUATION)) |
| >> Character.getType(ch)) & 1) != 0, |
| JOIN_CONTROL()); |
| } |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| private static CharPredicate getPosixPredicate(String name) { |
| switch (name) { |
| case "ALPHA": return ALPHABETIC(); |
| case "LOWER": return LOWERCASE(); |
| case "UPPER": return UPPERCASE(); |
| case "SPACE": return WHITE_SPACE(); |
| case "PUNCT": return PUNCTUATION(); |
| case "XDIGIT": return HEX_DIGIT(); |
| case "ALNUM": return ALNUM(); |
| case "CNTRL": return CONTROL(); |
| case "DIGIT": return DIGIT(); |
| case "BLANK": return BLANK(); |
| case "GRAPH": return GRAPH(); |
| case "PRINT": return PRINT(); |
| default: return null; |
| } |
| } |
| |
| private static CharPredicate getUnicodePredicate(String name) { |
| switch (name) { |
| case "ALPHABETIC": return ALPHABETIC(); |
| case "ASSIGNED": return ASSIGNED(); |
| case "CONTROL": return CONTROL(); |
| case "HEXDIGIT": return HEX_DIGIT(); |
| case "IDEOGRAPHIC": return IDEOGRAPHIC(); |
| case "JOINCONTROL": return JOIN_CONTROL(); |
| case "LETTER": return LETTER(); |
| case "LOWERCASE": return LOWERCASE(); |
| case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT(); |
| case "TITLECASE": return TITLECASE(); |
| case "PUNCTUATION": return PUNCTUATION(); |
| case "UPPERCASE": return UPPERCASE(); |
| case "WHITESPACE": return WHITE_SPACE(); |
| case "WORD": return WORD(); |
| case "WHITE_SPACE": return WHITE_SPACE(); |
| case "HEX_DIGIT": return HEX_DIGIT(); |
| case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT(); |
| case "JOIN_CONTROL": return JOIN_CONTROL(); |
| default: return null; |
| } |
| } |
| |
| public static CharPredicate forUnicodeProperty(String propName) { |
| propName = propName.toUpperCase(Locale.ROOT); |
| CharPredicate p = getUnicodePredicate(propName); |
| if (p != null) |
| return p; |
| return getPosixPredicate(propName); |
| } |
| |
| public static CharPredicate forPOSIXName(String propName) { |
| return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH)); |
| } |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Returns a predicate matching all characters belong to a named |
| * UnicodeScript. |
| */ |
| static CharPredicate forUnicodeScript(String name) { |
| final Character.UnicodeScript script; |
| try { |
| script = Character.UnicodeScript.forName(name); |
| return ch -> script == Character.UnicodeScript.of(ch); |
| } catch (IllegalArgumentException iae) {} |
| return null; |
| } |
| |
| /** |
| * Returns a predicate matching all characters in a UnicodeBlock. |
| */ |
| static CharPredicate forUnicodeBlock(String name) { |
| final Character.UnicodeBlock block; |
| try { |
| block = Character.UnicodeBlock.forName(name); |
| return ch -> block == Character.UnicodeBlock.of(ch); |
| } catch (IllegalArgumentException iae) {} |
| return null; |
| } |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| // unicode categories, aliases, properties, java methods ... |
| |
| static CharPredicate forProperty(String name) { |
| // Unicode character property aliases, defined in |
| // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt |
| switch (name) { |
| case "Cn": return category(1<<Character.UNASSIGNED); |
| case "Lu": return category(1<<Character.UPPERCASE_LETTER); |
| case "Ll": return category(1<<Character.LOWERCASE_LETTER); |
| case "Lt": return category(1<<Character.TITLECASE_LETTER); |
| case "Lm": return category(1<<Character.MODIFIER_LETTER); |
| case "Lo": return category(1<<Character.OTHER_LETTER); |
| case "Mn": return category(1<<Character.NON_SPACING_MARK); |
| case "Me": return category(1<<Character.ENCLOSING_MARK); |
| case "Mc": return category(1<<Character.COMBINING_SPACING_MARK); |
| case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER); |
| case "Nl": return category(1<<Character.LETTER_NUMBER); |
| case "No": return category(1<<Character.OTHER_NUMBER); |
| case "Zs": return category(1<<Character.SPACE_SEPARATOR); |
| case "Zl": return category(1<<Character.LINE_SEPARATOR); |
| case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR); |
| case "Cc": return category(1<<Character.CONTROL); |
| case "Cf": return category(1<<Character.FORMAT); |
| case "Co": return category(1<<Character.PRIVATE_USE); |
| case "Cs": return category(1<<Character.SURROGATE); |
| case "Pd": return category(1<<Character.DASH_PUNCTUATION); |
| case "Ps": return category(1<<Character.START_PUNCTUATION); |
| case "Pe": return category(1<<Character.END_PUNCTUATION); |
| case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION); |
| case "Po": return category(1<<Character.OTHER_PUNCTUATION); |
| case "Sm": return category(1<<Character.MATH_SYMBOL); |
| case "Sc": return category(1<<Character.CURRENCY_SYMBOL); |
| case "Sk": return category(1<<Character.MODIFIER_SYMBOL); |
| case "So": return category(1<<Character.OTHER_SYMBOL); |
| case "Pi": return category(1<<Character.INITIAL_QUOTE_PUNCTUATION); |
| case "Pf": return category(1<<Character.FINAL_QUOTE_PUNCTUATION); |
| case "L": return category(((1<<Character.UPPERCASE_LETTER) | |
| (1<<Character.LOWERCASE_LETTER) | |
| (1<<Character.TITLECASE_LETTER) | |
| (1<<Character.MODIFIER_LETTER) | |
| (1<<Character.OTHER_LETTER))); |
| case "M": return category(((1<<Character.NON_SPACING_MARK) | |
| (1<<Character.ENCLOSING_MARK) | |
| (1<<Character.COMBINING_SPACING_MARK))); |
| case "N": return category(((1<<Character.DECIMAL_DIGIT_NUMBER) | |
| (1<<Character.LETTER_NUMBER) | |
| (1<<Character.OTHER_NUMBER))); |
| case "Z": return category(((1<<Character.SPACE_SEPARATOR) | |
| (1<<Character.LINE_SEPARATOR) | |
| (1<<Character.PARAGRAPH_SEPARATOR))); |
| case "C": return category(((1<<Character.CONTROL) | |
| (1<<Character.FORMAT) | |
| (1<<Character.PRIVATE_USE) | |
| (1<<Character.SURROGATE) | |
| (1<<Character.UNASSIGNED))); // Other |
| case "P": return category(((1<<Character.DASH_PUNCTUATION) | |
| (1<<Character.START_PUNCTUATION) | |
| (1<<Character.END_PUNCTUATION) | |
| (1<<Character.CONNECTOR_PUNCTUATION) | |
| (1<<Character.OTHER_PUNCTUATION) | |
| (1<<Character.INITIAL_QUOTE_PUNCTUATION) | |
| (1<<Character.FINAL_QUOTE_PUNCTUATION))); |
| case "S": return category(((1<<Character.MATH_SYMBOL) | |
| (1<<Character.CURRENCY_SYMBOL) | |
| (1<<Character.MODIFIER_SYMBOL) | |
| (1<<Character.OTHER_SYMBOL))); |
| case "LC": return category(((1<<Character.UPPERCASE_LETTER) | |
| (1<<Character.LOWERCASE_LETTER) | |
| (1<<Character.TITLECASE_LETTER))); |
| case "LD": return category(((1<<Character.UPPERCASE_LETTER) | |
| (1<<Character.LOWERCASE_LETTER) | |
| (1<<Character.TITLECASE_LETTER) | |
| (1<<Character.MODIFIER_LETTER) | |
| (1<<Character.OTHER_LETTER) | |
| (1<<Character.DECIMAL_DIGIT_NUMBER))); |
| case "L1": return range(0x00, 0xFF); // Latin-1 |
| case "all": return Pattern.ALL(); |
| // Posix regular expression character classes, defined in |
| // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html |
| case "ASCII": return range(0x00, 0x7F); // ASCII |
| case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters |
| case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters |
| case "Blank": return ctype(ASCII.BLANK); // Space and tab characters |
| case "Cntrl": return ctype(ASCII.CNTRL); // Control characters |
| case "Digit": return range('0', '9'); // Numeric characters |
| case "Graph": return ctype(ASCII.GRAPH); // printable and visible |
| case "Lower": return range('a', 'z'); // Lower-case alphabetic |
| case "Print": return range(0x20, 0x7E); // Printable characters |
| case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters |
| case "Space": return ctype(ASCII.SPACE); // Space characters |
| case "Upper": return range('A', 'Z'); // Upper-case alphabetic |
| case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits |
| |
| // Java character properties, defined by methods in Character.java |
| case "javaLowerCase": return java.lang.Character::isLowerCase; |
| case "javaUpperCase": return Character::isUpperCase; |
| case "javaAlphabetic": return java.lang.Character::isAlphabetic; |
| case "javaIdeographic": return java.lang.Character::isIdeographic; |
| case "javaTitleCase": return java.lang.Character::isTitleCase; |
| case "javaDigit": return java.lang.Character::isDigit; |
| case "javaDefined": return java.lang.Character::isDefined; |
| case "javaLetter": return java.lang.Character::isLetter; |
| case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit; |
| case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart; |
| case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart; |
| case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart; |
| case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart; |
| case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable; |
| case "javaSpaceChar": return java.lang.Character::isSpaceChar; |
| case "javaWhitespace": return java.lang.Character::isWhitespace; |
| case "javaISOControl": return java.lang.Character::isISOControl; |
| case "javaMirrored": return java.lang.Character::isMirrored; |
| default: return null; |
| } |
| } |
| |
| private static CharPredicate category(final int typeMask) { |
| return ch -> (typeMask & (1 << Character.getType(ch))) != 0; |
| } |
| |
| private static CharPredicate range(final int lower, final int upper) { |
| return (BmpCharPredicate)ch -> lower <= ch && ch <= upper; |
| } |
| |
| private static CharPredicate ctype(final int ctype) { |
| return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype); |
| } |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| /** |
| * Posix ASCII variants, not in the lookup map |
| */ |
| static final BmpCharPredicate ASCII_DIGIT() { |
| return ch -> ch < 128 && ASCII.isDigit(ch); |
| } |
| static final BmpCharPredicate ASCII_WORD() { |
| return ch -> ch < 128 && ASCII.isWord(ch); |
| } |
| static final BmpCharPredicate ASCII_SPACE() { |
| return ch -> ch < 128 && ASCII.isSpace(ch); |
| } |
| |
| } |