blob: 3b01ae6bac668df9df353bbd68f14f2c638a5a3f [file] [log] [blame]
Ben Gruver324c4642011-11-15 16:02:09 -08001/** \file
2 * Defines the basic structures used to manipulate character
3 * streams from any input source. Any character size and encoding
4 * can in theory be used, so long as a set of functions is provided that
5 * can return a 32 bit Integer representation of their characters and efficiently mark and revert
6 * to specific offsets into their input streams.
7 */
8#ifndef _ANTLR3_INPUT_H
9#define _ANTLR3_INPUT_H
10
11// [The "BSD licence"]
12// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
13// http://www.temporal-wave.com
14// http://www.linkedin.com/in/jimidle
15//
16// All rights reserved.
17//
18// Redistribution and use in source and binary forms, with or without
19// modification, are permitted provided that the following conditions
20// are met:
21// 1. Redistributions of source code must retain the above copyright
22// notice, this list of conditions and the following disclaimer.
23// 2. Redistributions in binary form must reproduce the above copyright
24// notice, this list of conditions and the following disclaimer in the
25// documentation and/or other materials provided with the distribution.
26// 3. The name of the author may not be used to endorse or promote products
27// derived from this software without specific prior written permission.
28//
29// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
30// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
31// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
32// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
33// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
34// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
38// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39
40#include <antlr3defs.h>
41#include <antlr3string.h>
42#include <antlr3commontoken.h>
43#include <antlr3intstream.h>
44#include <antlr3convertutf.h>
45
46#ifdef __cplusplus
47extern "C" {
48#endif
49
50
51
52/// Master context structure for an ANTLR3 C runtime based input stream.
53/// \ingroup apistructures
54///
55typedef struct ANTLR3_INPUT_STREAM_struct
56{
57 /** Interfaces that provide streams must all provide
58 * a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM
59 * is no different.
60 */
61 pANTLR3_INT_STREAM istream;
62
63 /** Whatever super structure is providing the INPUT stream needs a pointer to itself
64 * so that this can be passed back to it whenever the api functions
65 * are called back from this interface.
66 */
67 void * super;
68
69 /** Pointer to the start of the input string; characters may be
70 * taken as offsets from here and are in the original input format encoding.
71 */
72 void * data;
73
74 /** Indicates if the data pointer was allocated by us, and so should be freed
75 * when the stream dies.
76 */
77 int isAllocated;
78
79 /** String factory for this input stream
80 */
81 pANTLR3_STRING_FACTORY strFactory;
82
83
84 /** Pointer to the next character to be consumed from the input data
85 * This is cast to point at the encoding of the original file that
86 * was read by the functions installed as pointer in this input stream
87 * context instance at file/string/whatever load time.
88 */
89 void * nextChar;
90
91 /** Number of characters that can be consumed at this point in time.
92 * Mostly this is just what is left in the pre-read buffer, but if the
93 * input source is a stream such as a socket or something then we may
94 * call special read code to wait for more input.
95 */
96 ANTLR3_UINT32 sizeBuf;
97
98 /** The line number we are traversing in the input file. This gets incremented
99 * by a newline() call in the lexer grammar actions.
100 */
101 ANTLR3_UINT32 line;
102
103 /** Pointer into the input buffer where the current line
104 * started.
105 */
106 void * currentLine;
107
108 /** The offset within the current line of the current character
109 */
110 ANTLR3_INT32 charPositionInLine;
111
112 /** Tracks how deep mark() calls are nested
113 */
114 ANTLR3_UINT32 markDepth;
115
116 /** List of mark() points in the input stream
117 */
118 pANTLR3_VECTOR markers;
119
120 /** File name string. If you set it manually, set it to a pointer to memory,
121 * as it will be free()d.
122 */
123 pANTLR3_STRING fileName;
124
125 /** File number, needs to be set manually to some file index of your devising.
126 */
127 ANTLR3_UINT32 fileNo;
128
129 /* API */
130
131
132 /** Pointer to function that closes the input stream
133 */
134 void (*close) (struct ANTLR3_INPUT_STREAM_struct * input);
135 void (*free) (struct ANTLR3_INPUT_STREAM_struct * input); /**< NOTE(review): undocumented in this header; presumably releases the stream's resources - confirm against the implementation. */
136
137 /** Pointer to function that resets the input stream
138 */
139 void (*reset) (struct ANTLR3_INPUT_STREAM_struct * input);
140
141 /** Pointer to a function that reuses and resets an input stream by
142 * supplying a new 'source'
143 */
144 void (*reuse) (struct ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
145
146 /**
147 * Pointer to function that installs a version of LA that always
148 * returns upper case. Only valid for character streams and creates a case
149 * insensitive lexer if the lexer tokens are described in upper case. The
150 * tokens will preserve case in the token text.
151 */
152 void (*setUcaseLA) (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
153
154 /** Pointer to function to return input stream element at 1 based
155 * offset from nextChar. Same as _LA for char stream, but token
156 * streams etc. have one of these that does other stuff of course.
157 */
158 void * (*_LT) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt);
159
160 /** Pointer to function to return the total size of the input buffer. For streams
161 * this may be just the total we have available so far. This means of course that
162 * the input stream must be careful to accumulate enough input so that any backtracking
163 * can be satisfied.
164 */
165 ANTLR3_UINT32 (*size) (struct ANTLR3_INPUT_STREAM_struct * input);
166
167 /** Pointer to function to return a substring of the input stream. String is returned in allocated
168 * memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form.
169 */
170 pANTLR3_STRING (*substr) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
171
172 /** Pointer to function to return the current line number in the input stream
173 */
174 ANTLR3_UINT32 (*getLine) (struct ANTLR3_INPUT_STREAM_struct * input);
175
176 /** Pointer to function to return the current line buffer in the input stream
177 * The pointer returned is directly into the input stream so you must copy
178 * it if you wish to manipulate it without damaging the input stream. Encoding
179 * is obviously in the same form as the input stream.
180 * \remark
181 * - Note that this function will be inaccurate if setLine is called as there
182 * is no way at the moment to position the input stream at a particular line
183 * number offset.
184 */
185 void * (*getLineBuf) (struct ANTLR3_INPUT_STREAM_struct * input);
186
187 /** Pointer to function to return the current offset in the current input stream line
188 * Note the return type is unsigned while the charPositionInLine field is ANTLR3_INT32. */
189 ANTLR3_UINT32 (*getCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input);
190
191 /** Pointer to function to set the current line number in the input stream
192 */
193 void (*setLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line);
194
195 /** Pointer to function to set the current position in the current line.
196 */
197 void (*setCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position);
198
199 /** Pointer to function to override the default newline character that the input stream
200 * looks for to trigger the line/offset and line buffer recording information.
201 * \remark
202 * - By default the character '\n' will be installed as the newline trigger character. When this
203 * character is seen by the consume() function then the current line number is incremented and the
204 * current line offset is reset to 0. The Pointer for the line of input we are consuming
205 * is updated to point to the next character after this one in the input stream (which means it
206 * may become invalid if the last newline character in the file is seen, so watch out).
207 * - If for some reason you do not want the counters and pointers to be reset, you can set the
208 * character to some impossible character such as '\0' or whatever.
209 * - This is a single character only, so choose the last character in a sequence of two or more.
210 * - This is only a simple aid to error reporting - if you have a complicated binary input structure
211 * it may not be adequate, but you can always override every function in the input stream with your
212 * own of course, and can even write your own complete input stream set if you like.
213 * - It is your responsibility to set a valid character for the input stream type. There is no point
214 * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
215 * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
216 */
217 void (*SetNewLineChar) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar);
218
219 /// Character that automatically causes an internal line count
220 /// increment.
221 ///
222 ANTLR3_UCHAR newlineChar;
223
224 /// Indicates the size, in 8 bit units, of a single character. Note that
225 /// the C runtime does not deal with surrogates as this would be
226 /// slow and complicated. If this is a UTF-8 stream then this field
227 /// will be set to 0. Generally you are best working internally with 32 bit characters
228 /// as this is the most efficient.
229 ///
230 ANTLR3_UINT8 charByteSize;
231
232 /// Indicates the encoding scheme used in this input stream
233 ///
234 ANTLR3_UINT32 encoding;
235}
236
237 ANTLR3_INPUT_STREAM;
238
239
240/** \brief Structure for tracking lexer input state as part of mark()
241 * and rewind() of lexer.
242 */
243typedef struct ANTLR3_LEX_STATE_struct
244{
245 /** Pointer to the next character to be consumed from the input data
246 * This is cast to point at the encoding of the original file that
247 * was read by the functions installed as pointer in this input stream
248 * context instance at file/string/whatever load time.
249 */
250 void * nextChar;
251
252 /** The line number we are traversing in the input file. This gets incremented
253 * by a newline() call in the lexer grammar actions.
254 */
255 ANTLR3_UINT32 line;
256
257 /** Pointer into the input buffer where the current line
258 * started.
259 */
260 void * currentLine;
261
262 /** The offset within the current line of the current character
263 */
264 ANTLR3_INT32 charPositionInLine;
265
266}
267 ANTLR3_LEX_STATE;
268
269/* Prototypes for the stream setup functions. Going by their names there is one per input encoding
270 * (8 bit, UTF-16, UTF-32, UTF-8, EBCDIC) plus a generic one; they are only declared, not defined, in this header. */
271void antlr38BitSetupStream (pANTLR3_INPUT_STREAM input);
272void antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
273void antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
274void antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input);
275void antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input);
276void antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input);
277#ifdef __cplusplus
278}
279#endif
280
281#endif /* _ANTLR3_INPUT_H */