blob: 9a1307a9078aab4bc62e118ca5297ad4db943049 [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
*   Copyright (C) 2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  unisetspan.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2007mar01
*   created by: Markus W. Scherer
*/
18
19#ifndef __UNISETSPAN_H__
20#define __UNISETSPAN_H__
21
22#include "unicode/utypes.h"
23#include "unicode/uniset.h"
24
25U_NAMESPACE_BEGIN
26
/*
 * Implement span() etc. for a set with strings.
 * Avoid recursion because of its exponential complexity.
 * Instead, try multiple paths at once and track them with an IndexList.
 */
32class UnicodeSetStringSpan : public UMemory {
33public:
34 /*
35 * Which span() variant will be used?
36 * The object is either built for one variant and used once,
37 * or built for all and may be used many times.
38 */
39 enum {
40 FWD = 0x20,
41 BACK = 0x10,
42 UTF16 = 8,
43 UTF8 = 4,
44 CONTAINED = 2,
45 NOT_CONTAINED = 1,
46
47 ALL = 0x3f,
48
49 FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED,
50 FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED,
51 FWD_UTF8_CONTAINED = FWD | UTF8 | CONTAINED,
52 FWD_UTF8_NOT_CONTAINED = FWD | UTF8 | NOT_CONTAINED,
53 BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED,
54 BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
55 BACK_UTF8_CONTAINED = BACK | UTF8 | CONTAINED,
56 BACK_UTF8_NOT_CONTAINED = BACK | UTF8 | NOT_CONTAINED
57 };
58
59 UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
60
61 // Copy constructor. Assumes which==ALL for a frozen set.
62 UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
63
64 ~UnicodeSetStringSpan();
65
66 /*
67 * Do the strings need to be checked in span() etc.?
Victor Changce4bf3c2021-01-19 16:34:24 +000068 * @return true if strings need to be checked (call span() here),
69 * false if not (use a BMPSet for best performance).
Victor Chang73229502020-09-17 13:39:19 +010070 */
71 inline UBool needsStringSpanUTF16();
72 inline UBool needsStringSpanUTF8();
73
74 // For fast UnicodeSet::contains(c).
75 inline UBool contains(UChar32 c) const;
76
77 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
78
79 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
80
81 int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
82
83 int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
84
85private:
86 // Special spanLength byte values.
87 enum {
88 // The spanLength is >=0xfe.
89 LONG_SPAN=0xfe,
90 // All code points in the string are contained in the parent set.
91 ALL_CP_CONTAINED=0xff
92 };
93
94 // Add a starting or ending string character to the spanNotSet
95 // so that a character span ends before any string.
96 void addToSpanNotSet(UChar32 c);
97
98 int32_t spanNot(const UChar *s, int32_t length) const;
99 int32_t spanNotBack(const UChar *s, int32_t length) const;
100 int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
101 int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
102
103 // Set for span(). Same as parent but without strings.
104 UnicodeSet spanSet;
105
106 // Set for span(not contained).
107 // Same as spanSet, plus characters that start or end strings.
108 UnicodeSet *pSpanNotSet;
109
110 // The strings of the parent set.
111 const UVector &strings;
112
113 // Pointer to the UTF-8 string lengths.
114 // Also pointer to further allocated storage for meta data and
115 // UTF-8 string contents as necessary.
116 int32_t *utf8Lengths;
117
118 // Pointer to the part of the (utf8Lengths) memory block that stores
119 // the lengths of span(), spanBack() etc. for each string.
120 uint8_t *spanLengths;
121
122 // Pointer to the part of the (utf8Lengths) memory block that stores
123 // the UTF-8 versions of the parent set's strings.
124 uint8_t *utf8;
125
126 // Number of bytes for all UTF-8 versions of strings together.
127 int32_t utf8Length;
128
129 // Maximum lengths of relevant strings.
130 int32_t maxLength16;
131 int32_t maxLength8;
132
133 // Set up for all variants of span()?
134 UBool all;
135
136 // Memory for small numbers and lengths of strings.
137 // For example, for 8 strings:
138 // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
139 // = 112 bytes = int32_t[28].
140 int32_t staticLengths[32];
141};
142
143UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
144 return (UBool)(maxLength16!=0);
145}
146
147UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
148 return (UBool)(maxLength8!=0);
149}
150
151UBool UnicodeSetStringSpan::contains(UChar32 c) const {
152 return spanSet.contains(c);
153}
154
155U_NAMESPACE_END
156
157#endif