jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 1 | Index: source/data/brkitr/word.txt |
| 2 | =================================================================== |
| 3 | --- source/data/brkitr/word.txt (revision 259715) |
| 4 | +++ source/data/brkitr/word.txt (working copy) |
| 5 | @@ -35,10 +35,16 @@ |
| 6 | $ALetter = [\p{Word_Break = ALetter}]; |
| 7 | $Single_Quote = [\p{Word_Break = Single_Quote}]; |
| 8 | $Double_Quote = [\p{Word_Break = Double_Quote}]; |
| 9 | -$MidNumLet = [\p{Word_Break = MidNumLet}]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 10 | +# Move the two full stop characters (U+002E, U+FF0E) from $MidNumLet to $MidNum |
| 11 | +# so that a hostname is broken into its components, at the cost of also |
| 12 | +# breaking 'e.g.' and 'i.e.'. |
| 13 | +# $MidNumLet is used in rules 6/7 (the rules we care about) and rules 11/12. |
| 14 | +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected, |
| 15 | +# while rules 6/7 are reverted to the old behavior we want. |
| 16 | +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 17 | $MidLetter = [\p{Word_Break = MidLetter}]; |
| 18 | -$MidNum = [\p{Word_Break = MidNum}]; |
| 19 | -$Numeric = [\p{Word_Break = Numeric}]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 20 | +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; |
| 21 | +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 22 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 23 | |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 24 | $Han = [:Han:]; |
| 25 | Index: source/data/brkitr/line.txt |
| 26 | =================================================================== |
| 27 | --- source/data/brkitr/line.txt (revision 259715) |
| 28 | +++ source/data/brkitr/line.txt (working copy) |
| 29 | @@ -12,9 +12,8 @@ |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 30 | # This is only because of a limitation of ICU break engine implementation, |
| 31 | # not because the older behavior is desirable. |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 32 | |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 33 | -# |
| 34 | -# Character Classes defined by TR 14. |
| 35 | -# |
| 36 | +# CHROME: 1. Use line_ja.txt to apply small kana rules in all locales. |
| 37 | +# 2. Adjust CL, OP, and IS to handle 'comma-variants' consistently. |
| 38 | |
| 39 | !!chain; |
| 40 | !!LBCMNoChain; |
| 41 | @@ -57,14 +56,14 @@ |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 42 | # |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 43 | |
| 44 | $AI = [:LineBreak = Ambiguous:]; |
| 45 | -$AL = [:LineBreak = Alphabetic:]; |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 46 | +$AL = [[:LineBreak = Alphabetic:] - [\u23B4\u23B5]]; |
| 47 | $BA = [:LineBreak = Break_After:]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 48 | $BB = [:LineBreak = Break_Before:]; |
| 49 | $BK = [:LineBreak = Mandatory_Break:]; |
| 50 | $B2 = [:LineBreak = Break_Both:]; |
| 51 | $CB = [:LineBreak = Contingent_Break:]; |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 52 | $CJ = [:LineBreak = Conditional_Japanese_Starter:]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 53 | -$CL = [:LineBreak = Close_Punctuation:]; |
| 54 | +$CL = [[:LineBreak = Close_Punctuation:] [\uFE51\uFE10\u23B5]]; |
| 55 | $CM = [:LineBreak = Combining_Mark:]; |
| 56 | $CP = [:LineBreak = Close_Parenthesis:]; |
| 57 | $CR = [:LineBreak = Carriage_Return:]; |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 58 | @@ -74,16 +73,16 @@ |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 59 | $HY = [:LineBreak = Hyphen:]; |
| 60 | $H2 = [:LineBreak = H2:]; |
| 61 | $H3 = [:LineBreak = H3:]; |
| 62 | -$ID = [:LineBreak = Ideographic:]; |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 63 | +$ID = [[[:LineBreak = Ideographic:] $CJ] - [\uFE51]]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 64 | $IN = [:LineBreak = Inseperable:]; |
| 65 | -$IS = [:LineBreak = Infix_Numeric:]; |
| 66 | +$IS = [[:LineBreak = Infix_Numeric:] - [\uFE10]]; |
| 67 | $JL = [:LineBreak = JL:]; |
| 68 | $JV = [:LineBreak = JV:]; |
| 69 | $JT = [:LineBreak = JT:]; |
| 70 | $LF = [:LineBreak = Line_Feed:]; |
| 71 | $NL = [:LineBreak = Next_Line:]; |
jshin@chromium.org | cc7f2c2 | 2014-05-24 00:18:36 +0000 | [diff] [blame] | 72 | -$NS = [[:LineBreak = Nonstarter:] $CJ]; |
| 73 | +$NS = [:LineBreak = Nonstarter:]; |
jshin@chromium.org | c1025d0 | 2014-04-03 00:18:15 +0000 | [diff] [blame] | 74 | $NU = [:LineBreak = Numeric:]; |
| 75 | -$OP = [:LineBreak = Open_Punctuation:]; |
| 76 | +$OP = [[:LineBreak = Open_Punctuation:] \u23B4]; |
| 77 | $PO = [:LineBreak = Postfix_Numeric:]; |
| 78 | $PR = [:LineBreak = Prefix_Numeric:]; |