Ben Gruver | 324c464 | 2011-11-15 16:02:09 -0800 | [diff] [blame^] | 1 | /** \file |
| 2 | * Defines the basic structures used to manipulate character |
| 3 | * streams from any input source. Any character size and encoding |
| 4 | * can in theory be used, so long as a set of functions is provided that |
| 5 | * can return a 32 bit Integer representation of their characters and efficiently mark and revert |
| 6 | * to specific offsets into their input streams. |
| 7 | */ |
| 8 | #ifndef _ANTLR3_INPUT_H |
| 9 | #define _ANTLR3_INPUT_H |
| 10 | |
| 11 | // [The "BSD licence"] |
| 12 | // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC |
| 13 | // http://www.temporal-wave.com |
| 14 | // http://www.linkedin.com/in/jimidle |
| 15 | // |
| 16 | // All rights reserved. |
| 17 | // |
| 18 | // Redistribution and use in source and binary forms, with or without |
| 19 | // modification, are permitted provided that the following conditions |
| 20 | // are met: |
| 21 | // 1. Redistributions of source code must retain the above copyright |
| 22 | // notice, this list of conditions and the following disclaimer. |
| 23 | // 2. Redistributions in binary form must reproduce the above copyright |
| 24 | // notice, this list of conditions and the following disclaimer in the |
| 25 | // documentation and/or other materials provided with the distribution. |
| 26 | // 3. The name of the author may not be used to endorse or promote products |
| 27 | // derived from this software without specific prior written permission. |
| 28 | // |
| 29 | // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| 30 | // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| 31 | // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| 32 | // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 33 | // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| 34 | // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 35 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 36 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 37 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| 38 | // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 39 | |
| 40 | #include <antlr3defs.h> |
| 41 | #include <antlr3string.h> |
| 42 | #include <antlr3commontoken.h> |
| 43 | #include <antlr3intstream.h> |
| 44 | #include <antlr3convertutf.h> |
| 45 | |
| 46 | #ifdef __cplusplus |
| 47 | extern "C" { |
| 48 | #endif |
| 49 | |
| 50 | |
| 51 | |
| 52 | /// Master context structure for an ANTLR3 C runtime based input stream. |
| 53 | /// \ingroup apistructures |
| 54 | /// |
| 55 | typedef struct ANTLR3_INPUT_STREAM_struct |
| 56 | { |
| 57 | /** Interfaces that provide streams must all provide |
| 58 | * a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM |
| 59 | * is no different. |
| 60 | */ |
| 61 | pANTLR3_INT_STREAM istream; |
| 62 | |
| 63 | /** Whatever super structure is providing the INPUT stream needs a pointer to itself |
| 64 | * so that this can be passed back to it whenever the api functions |
| 65 | * are called back from this interface. |
| 66 | */ |
| 67 | void * super; |
| 68 | |
| 69 | /** Pointer to the start of the input string; characters may be |
| 70 | * taken as offsets from here and are in the original input format encoding. |
| 71 | */ |
| 72 | void * data; |
| 73 | |
| 74 | /** Indicates if the data pointer was allocated by us, and so should be freed |
| 75 | * when the stream dies. |
| 76 | */ |
| 77 | int isAllocated; |
| 78 | |
| 79 | /** String factory for this input stream |
| 80 | */ |
| 81 | pANTLR3_STRING_FACTORY strFactory; |
| 82 | |
| 83 | |
| 84 | /** Pointer to the next character to be consumed from the input data |
| 85 | * This is cast to point at the encoding of the original file that |
| 86 | * was read by the functions installed as pointer in this input stream |
| 87 | * context instance at file/string/whatever load time. |
| 88 | */ |
| 89 | void * nextChar; |
| 90 | |
| 91 | /** Number of characters that can be consumed at this point in time. |
| 92 | * Mostly this is just what is left in the pre-read buffer, but if the |
| 93 | * input source is a stream such as a socket or something then we may |
| 94 | * call special read code to wait for more input. |
| 95 | */ |
| 96 | ANTLR3_UINT32 sizeBuf; |
| 97 | |
| 98 | /** The line number we are traversing in the input file. This gets incremented |
| 99 | * by a newline() call in the lexer grammar actions. |
| 100 | */ |
| 101 | ANTLR3_UINT32 line; |
| 102 | |
| 103 | /** Pointer into the input buffer where the current line |
| 104 | * started. |
| 105 | */ |
| 106 | void * currentLine; |
| 107 | |
| 108 | /** The offset within the current line of the current character |
| 109 | */ |
| 110 | ANTLR3_INT32 charPositionInLine; |
| 111 | |
| 112 | /** Tracks how deep mark() calls are nested |
| 113 | */ |
| 114 | ANTLR3_UINT32 markDepth; |
| 115 | |
| 116 | /** List of mark() points in the input stream |
| 117 | */ |
| 118 | pANTLR3_VECTOR markers; |
| 119 | |
| 120 | /** File name string; set it to a pointer to allocated memory if |
| 121 | * you set it manually, as it will be free()d |
| 122 | */ |
| 123 | pANTLR3_STRING fileName; |
| 124 | |
| 125 | /** File number, needs to be set manually to some file index of your devising. |
| 126 | */ |
| 127 | ANTLR3_UINT32 fileNo; |
| 128 | |
| 129 | /* API */ |
| 130 | |
| 131 | |
| 132 | /** Pointer to function that closes the input stream |
| 133 | * NOTE(review): the free pointer declared directly after close has no description here; presumably it releases the stream - confirm. */ |
| 134 | void (*close) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 135 | void (*free) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 136 | |
| 137 | /** Pointer to function that resets the input stream |
| 138 | */ |
| 139 | void (*reset) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 140 | |
| 141 | /** Pointer to a function that reuses and resets an input stream by |
| 142 | * supplying a new 'source'. NOTE(review): presumably inString is the new source, size its length and name its label/file name - confirm. |
| 143 | */ |
| 144 | void (*reuse) (struct ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name); |
| 145 | |
| 146 | /** |
| 147 | * Pointer to function that installs a version of LA that always |
| 148 | * returns upper case. Only valid for character streams and creates a case |
| 149 | * insensitive lexer if the lexer tokens are described in upper case. The |
| 150 | * tokens will preserve case in the token text. |
| 151 | */ |
| 152 | void (*setUcaseLA) (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag); |
| 153 | |
| 154 | /** Pointer to function to return input stream element at 1 based |
| 155 | * offset from nextChar. Same as _LA for char stream, but token |
| 156 | * streams etc. have one of these that does other stuff of course. |
| 157 | */ |
| 158 | void * (*_LT) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt); |
| 159 | |
| 160 | /** Pointer to function to return the total size of the input buffer. For streams |
| 161 | * this may be just the total we have available so far. This means of course that |
| 162 | * the input stream must be careful to accumulate enough input so that any backtracking |
| 163 | * can be satisfied. |
| 164 | */ |
| 165 | ANTLR3_UINT32 (*size) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 166 | |
| 167 | /** Pointer to function to return a substring of the input stream. String is returned in allocated |
| 168 | * memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form. |
| 169 | */ |
| 170 | pANTLR3_STRING (*substr) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop); |
| 171 | |
| 172 | /** Pointer to function to return the current line number in the input stream |
| 173 | */ |
| 174 | ANTLR3_UINT32 (*getLine) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 175 | |
| 176 | /** Pointer to function to return the current line buffer in the input stream |
| 177 | * The pointer returned is directly into the input stream so you must copy |
| 178 | * it if you wish to manipulate it without damaging the input stream. Encoding |
| 179 | * is obviously in the same form as the input stream. |
| 180 | * \remark |
| 181 | * - Note that this function will be inaccurate if setLine is called as there |
| 182 | * is no way at the moment to position the input stream at a particular line |
| 183 | * number offset. |
| 184 | */ |
| 185 | void * (*getLineBuf) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 186 | |
| 187 | /** Pointer to function to return the current offset in the current input stream line |
| 188 | * Note: the return type is ANTLR3_UINT32 while the charPositionInLine field is declared ANTLR3_INT32. */ |
| 189 | ANTLR3_UINT32 (*getCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input); |
| 190 | |
| 191 | /** Pointer to function to set the current line number in the input stream |
| 192 | */ |
| 193 | void (*setLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line); |
| 194 | |
| 195 | /** Pointer to function to set the current position in the current line. |
| 196 | */ |
| 197 | void (*setCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position); |
| 198 | |
| 199 | /** Pointer to function to override the default newline character that the input stream |
| 200 | * looks for to trigger the line/offset and line buffer recording information. |
| 201 | * \remark |
| 202 | * - By default the character '\n' will be installed as the newline trigger character. When this |
| 203 | * character is seen by the consume() function then the current line number is incremented and the |
| 204 | * current line offset is reset to 0. The Pointer for the line of input we are consuming |
| 205 | * is updated to point to the next character after this one in the input stream (which means it |
| 206 | * may become invalid if the last newline character in the file is seen, so watch out). |
| 207 | * - If for some reason you do not want the counters and pointers to be reset, you can set the |
| 208 | * character to some impossible character such as '\0' or whatever. |
| 209 | * - This is a single character only, so choose the last character in a sequence of two or more. |
| 210 | * - This is only a simple aid to error reporting - if you have a complicated binary input structure |
| 211 | * it may not be adequate, but you can always override every function in the input stream with your |
| 212 | * own of course, and can even write your own complete input stream set if you like. |
| 213 | * - It is your responsibility to set a valid character for the input stream type. There is no point |
| 214 | * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never |
| 215 | * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF |
| 216 | */ |
| 217 | void (*SetNewLineChar) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar); |
| 218 | |
| 219 | /// Character that automatically causes an internal line count |
| 220 | /// increment. |
| 221 | /// |
| 222 | ANTLR3_UCHAR newlineChar; |
| 223 | |
| 224 | /// Indicates the size, in 8 bit units, of a single character. Note that |
| 225 | /// the C runtime does not deal with surrogates as this would be |
| 226 | /// slow and complicated. If this is a UTF-8 stream then this field |
| 227 | /// will be set to 0. Generally you are best working internally with 32 bit characters |
| 228 | /// as this is the most efficient. |
| 229 | /// |
| 230 | ANTLR3_UINT8 charByteSize; |
| 231 | |
| 232 | /// Indicates the encoding scheme used in this input stream |
| 233 | /// |
| 234 | ANTLR3_UINT32 encoding; |
| 235 | } |
| 236 | |
| 237 | ANTLR3_INPUT_STREAM; |
| 238 | |
| 239 | |
| 240 | /** \brief Structure for tracking lexer input states as part of mark() |
| 241 | * and rewind() of lexer. Its fields mirror the position-related fields of ANTLR3_INPUT_STREAM (nextChar, line, currentLine, charPositionInLine). |
| 242 | */ |
| 243 | typedef struct ANTLR3_LEX_STATE_struct |
| 244 | { |
| 245 | /** Pointer to the next character to be consumed from the input data |
| 246 | * This is cast to point at the encoding of the original file that |
| 247 | * was read by the functions installed as pointer in this input stream |
| 248 | * context instance at file/string/whatever load time. |
| 249 | */ |
| 250 | void * nextChar; |
| 251 | |
| 252 | /** The line number we are traversing in the input file. This gets incremented |
| 253 | * by a newline() call in the lexer grammar actions. |
| 254 | */ |
| 255 | ANTLR3_UINT32 line; |
| 256 | |
| 257 | /** Pointer into the input buffer where the current line |
| 258 | * started. |
| 259 | */ |
| 260 | void * currentLine; |
| 261 | |
| 262 | /** The offset within the current line of the current character |
| 263 | */ |
| 264 | ANTLR3_INT32 charPositionInLine; |
| 265 | |
| 266 | } |
| 267 | ANTLR3_LEX_STATE; |
| 268 | |
| 269 | /* Prototypes for the per-encoding stream setup functions (8 bit, UTF-16, UTF-32, UTF-8, EBCDIC, generic). NOTE(review): presumably each installs the matching function pointers into the supplied input stream; implementations are not in this header - confirm. |
| 270 | */ |
| 271 | void antlr38BitSetupStream (pANTLR3_INPUT_STREAM input); |
| 272 | void antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian); |
| 273 | void antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian); |
| 274 | void antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input); |
| 275 | void antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input); |
| 276 | void antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input); |
| 277 | #ifdef __cplusplus |
| 278 | } |
| 279 | #endif |
| 280 | |
| 281 | #endif /* _ANTLR3_INPUT_H */ |