blob: 9639c658b1c2b34a89af5c5edcf1c6dc1b758cee [file] [log] [blame]
Guido van Rossuma3309961993-07-28 09:05:47 +00001#ifndef Py_TOKENIZER_H
2#define Py_TOKENIZER_H
3#ifdef __cplusplus
4extern "C" {
5#endif
6
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00007#include "object.h"
Guido van Rossumf70e43a1991-02-19 12:39:46 +00008
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009/* Tokenizer interface */
10
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000011#include "token.h" /* For token types */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000012
/* Capacity limits used to size the fixed arrays in struct tok_state. */
#define MAXINDENT 100   /* Max indentation level */
#define MAXLEVEL 200    /* Max parentheses level */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000015
/* Decoding status of the tokenizer input (see the PEP 263 members of
   struct tok_state).  Enumerator order is significant: the values are
   implicit, starting at 0. */
enum decoding_state {
    STATE_INIT,
    STATE_RAW,
    STATE_NORMAL        /* have a codec associated with input */
};
21
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000022/* Tokenizer state */
23struct tok_state {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000024 /* Input state; buf <= cur <= inp <= end */
25 /* NB an entire line is held in the buffer */
26 char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */
27 char *cur; /* Next character in buffer */
28 char *inp; /* End of data in buffer */
29 char *end; /* End of input buffer if buf != NULL */
30 char *start; /* Start of current token if not NULL */
31 int done; /* E_OK normally, E_EOF at EOF, otherwise error code */
32 /* NB If done != E_OK, cur must be == inp!!! */
33 FILE *fp; /* Rest of input; NULL if tokenizing a string */
34 int tabsize; /* Tab spacing */
35 int indent; /* Current indentation index */
36 int indstack[MAXINDENT]; /* Stack of indents */
37 int atbol; /* Nonzero if at begin of new line */
38 int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
Serhiy Storchakac6792272013-10-19 21:03:34 +030039 const char *prompt, *nextprompt; /* For interactive prompting */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000040 int lineno; /* Current line number */
Anthony Sottile995d9b92019-01-12 20:05:13 -080041 int first_lineno; /* First line of a single line or multi line string
42 expression (cf. issue 16806) */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 int level; /* () [] {} Parentheses nesting level */
44 /* Used to allow free continuations inside them */
Victor Stinner7f2fee32011-04-05 00:39:01 +020045#ifndef PGEN
Serhiy Storchaka94cf3082018-12-17 17:34:14 +020046 char parenstack[MAXLEVEL];
47 int parenlinenostack[MAXLEVEL];
Victor Stinner7f2fee32011-04-05 00:39:01 +020048 /* pgen doesn't have access to Python codecs, it cannot decode the input
49 filename. The bytes filename might be kept, but it is only used by
50 indenterror() and it is not really needed: pgen only compiles one file
51 (Grammar/Grammar). */
52 PyObject *filename;
53#endif
Serhiy Storchaka94cf3082018-12-17 17:34:14 +020054 /* Stuff for checking on different tab sizes */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000055 int altindstack[MAXINDENT]; /* Stack of alternate indents */
56 /* Stuff for PEP 0263 */
57 enum decoding_state decoding_state;
58 int decoding_erred; /* whether erred in decoding */
59 int read_coding_spec; /* whether 'coding:...' has been read */
60 char *encoding; /* Source encoding. */
61 int cont_line; /* whether we are in a continuation line. */
62 const char* line_start; /* pointer to start of current line */
Anthony Sottile995d9b92019-01-12 20:05:13 -080063 const char* multi_line_start; /* pointer to start of first line of
64 a single line or multi line string
65 expression (cf. issue 16806) */
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000066#ifndef PGEN
Victor Stinner22a351a2010-10-14 12:04:34 +000067 PyObject *decoding_readline; /* open(...).readline */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000068 PyObject *decoding_buffer;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +000069#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 const char* enc; /* Encoding for the current str. */
71 const char* str;
72 const char* input; /* Tokenizer's newline translated copy of the string. */
Guido van Rossumdcfcd142019-01-31 03:40:27 -080073
74 int type_comments; /* Whether to look for type comments */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000075};
76
/* Tokenizer entry points.
   NOTE(review): the parameter names below are documentation only and are
   assumed to mirror the definitions in tokenizer.c -- confirm.  The
   parameter and return types are unchanged. */

/* Build a tokenizer state from an in-memory string. */
extern struct tok_state *PyTokenizer_FromString(const char *str,
                                                int exec_input);

/* Same signature as PyTokenizer_FromString; by its name the input is
   UTF-8 -- TODO confirm. */
extern struct tok_state *PyTokenizer_FromUTF8(const char *str,
                                              int exec_input);

/* Build a tokenizer state that reads from a stdio stream.
   NOTE(review): ps1/ps2 presumably feed the prompt/nextprompt members of
   struct tok_state -- confirm. */
extern struct tok_state *PyTokenizer_FromFile(FILE *fp, const char *enc,
                                              const char *ps1,
                                              const char *ps2);

/* Release a tokenizer state. */
extern void PyTokenizer_Free(struct tok_state *tok);

/* Fetch the next token; returns an int and reports two char pointers
   through p_start / p_end.
   NOTE(review): presumably the token type from token.h and the token's
   start/end within the buffer -- confirm. */
extern int PyTokenizer_Get(struct tok_state *tok,
                           char **p_start, char **p_end);
Guido van Rossuma3309961993-07-28 09:05:47 +000083
84#ifdef __cplusplus
85}
86#endif
87#endif /* !Py_TOKENIZER_H */