#ifndef Py_TOKENIZER_H
#define Py_TOKENIZER_H
#ifdef __cplusplus
extern "C" {
#endif

#include "object.h"

/* Tokenizer interface */

#include "token.h"      /* For token types */

/* Max indentation level; sizes the indent stacks kept in struct tok_state. */
#define MAXINDENT 100

/* Input decoding state, as stored in tok_state.decoding_state. */
enum decoding_state {
    STATE_INIT,
    STATE_RAW,
    STATE_NORMAL        /* have a codec associated with input */
};

Guido van Rossum85a5fbb1990-10-14 12:07:46 +000021/* Tokenizer state */
22struct tok_state {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000023 /* Input state; buf <= cur <= inp <= end */
24 /* NB an entire line is held in the buffer */
25 char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */
26 char *cur; /* Next character in buffer */
27 char *inp; /* End of data in buffer */
28 char *end; /* End of input buffer if buf != NULL */
29 char *start; /* Start of current token if not NULL */
30 int done; /* E_OK normally, E_EOF at EOF, otherwise error code */
31 /* NB If done != E_OK, cur must be == inp!!! */
32 FILE *fp; /* Rest of input; NULL if tokenizing a string */
33 int tabsize; /* Tab spacing */
34 int indent; /* Current indentation index */
35 int indstack[MAXINDENT]; /* Stack of indents */
36 int atbol; /* Nonzero if at begin of new line */
37 int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
Serhiy Storchakac6792272013-10-19 21:03:34 +030038 const char *prompt, *nextprompt; /* For interactive prompting */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 int lineno; /* Current line number */
40 int level; /* () [] {} Parentheses nesting level */
41 /* Used to allow free continuations inside them */
42 /* Stuff for checking on different tab sizes */
Victor Stinner7f2fee32011-04-05 00:39:01 +020043#ifndef PGEN
44 /* pgen doesn't have access to Python codecs, it cannot decode the input
45 filename. The bytes filename might be kept, but it is only used by
46 indenterror() and it is not really needed: pgen only compiles one file
47 (Grammar/Grammar). */
48 PyObject *filename;
49#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000050 int altwarning; /* Issue warning if alternate tabs don't match */
51 int alterror; /* Issue error if alternate tabs don't match */
52 int alttabsize; /* Alternate tab spacing */
53 int altindstack[MAXINDENT]; /* Stack of alternate indents */
54 /* Stuff for PEP 0263 */
55 enum decoding_state decoding_state;
56 int decoding_erred; /* whether erred in decoding */
57 int read_coding_spec; /* whether 'coding:...' has been read */
58 char *encoding; /* Source encoding. */
59 int cont_line; /* whether we are in a continuation line. */
60 const char* line_start; /* pointer to start of current line */
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000061#ifndef PGEN
Victor Stinner22a351a2010-10-14 12:04:34 +000062 PyObject *decoding_readline; /* open(...).readline */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 PyObject *decoding_buffer;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000064#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065 const char* enc; /* Encoding for the current str. */
66 const char* str;
67 const char* input; /* Tokenizer's newline translated copy of the string. */
Yury Selivanov75445082015-05-11 22:57:16 -040068
Yury Selivanov96ec9342015-07-23 15:01:58 +030069 /* async/await related fields; can be removed in 3.7 when async and await
70 become normal keywords. */
71 int async_def; /* =1 if tokens are inside an 'async def' body. */
72 int async_def_indent; /* Indentation level of the outermost 'async def'. */
73 int async_def_nl; /* =1 if the outermost 'async def' had at least one
74 NEWLINE token after it. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000075};
76
/* Tokenizer entry points.  The From* functions each return a
   struct tok_state pointer; PyTokenizer_Free takes one.
   NOTE(review): the meaning of the unnamed parameters is not visible in
   this header -- confirm against the definitions before naming them. */
extern struct tok_state *PyTokenizer_FromString(const char *, int);
extern struct tok_state *PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *PyTokenizer_FromFile(FILE *, const char *,
                                              const char *, const char *);
extern void PyTokenizer_Free(struct tok_state *);
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
extern char *PyTokenizer_RestoreEncoding(struct tok_state *tok,
                                         int len, int *offset);

#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKENIZER_H */