-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.h
232 lines (218 loc) · 7.03 KB
/
lexer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/**
* @copyright
* Copyright (c) 2016-2021 Stanislav Ivochkin
*
* Licensed under the MIT License (see LICENSE)
*/
#ifndef EMBEDJSON_AMALGAMATE
#pragma once
#include "common.h"
#endif /* EMBEDJSON_AMALGAMATE */
/**
 * JSON lexer. Transforms an input stream of bytes into a stream
 * of tokens defined by the JSON grammar.
 *
 * For instance, the input {[:"foo"10 is transformed into
 * the series "open curly bracket", "open bracket", "colon",
 * "string 'foo'", "integer 10".
 *
 * The lexer does not take the meaning of the tokens into consideration,
 * so the input "{{{{" is handled successfully. Syntax checks
 * are performed by a higher-level abstraction - the parser.
 *
 * @note JSON strings are not accumulated by the lexer - only user
 * provided buffers are used to provide string values to the caller.
 * That's why each JSON string value is transformed into a possibly
 * empty series of embedjson_tokenc calls.
 *
 * A new string chunk is created each time one of the following events occurs:
 * - a buffer provided to the embedjson_lexer_push function is parsed
 *   to the end while the lexer is in the LEXER_STATE_IN_STRING state;
 * - an ASCII escape sequence is found in the string;
 * - a Unicode escape sequence is found in the string.
 *
 * For the user's convenience, two supplementary functions that wrap a
 * sequence of embedjson_tokenc calls are invoked by the lexer during parsing:
 * - embedjson_tokenc_begin
 * - embedjson_tokenc_end
 */
typedef struct embedjson_lexer {
  /**
   * Current lexer state, e.g. LEXER_STATE_IN_STRING.
   */
  unsigned char state;
  /*
   * NOTE(review): the meaning of offset is not visible in this header -
   * confirm against the implementation.
   */
  unsigned char offset;
  /*
   * NOTE(review): presumably holds the code point of a Unicode escape
   * sequence while it is being read - confirm against the implementation.
   */
  char unicode_cp[2];
  /*
   * NOTE(review): presumably the input encoding guessed from the magic
   * sequence and the number of magic bytes collected so far - confirm.
   */
  unsigned char encoding : 3;
  unsigned char magic_bytes_read : 3;
  /*
   * One-bit flags. They are declared unsigned on purpose: whether a plain
   * char bit-field is signed is implementation-defined, and a signed
   * one-bit field can only represent 0 and -1, so storing 1 into it or
   * comparing it with 1 is not portable. This also matches the unsigned
   * bit-fields above, so all of them share one underlying type.
   *
   * NOTE(review): presumably these describe the number being parsed (sign
   * of the number, sign of the exponent, whether any exponent digit has
   * been seen) - confirm against the implementation.
   */
  unsigned char minus : 1;
  unsigned char exp_minus : 1;
  unsigned char exp_not_empty : 1;
  /**
   * Magic sequence for encoding guessing
   */
  union {
    char as_char[4];
    int as_int;
  } magic;
  /*
   * NOTE(review): presumably accumulators for the integer part, fractional
   * part, fractional digit count and exponent of the number being parsed -
   * confirm against the implementation.
   */
  embedjson_int_t int_value;
  unsigned long long frac_value;
  unsigned short frac_power;
  unsigned short exp_value;
#if EMBEDJSON_VALIDATE_UTF8
  /**
   * Number of bytes remaining to complete multibyte UTF-8 sequence
   */
  unsigned char nb;
  /**
   * Corner cases for shortest possible UTF-8 encoding issue.
   *
   * See http://www.unicode.org/versions/corrigendum1.html for detailed
   * explanation of the issue and provided solution.
   *
   * Possible values are:
   *
   * @li 1 - for code points U+0800..U+0FFF. For these code points three bytes
   * are needed for encoding. If the first byte value is \xe0 (11100000), then
   * allowed values for the second byte are not \x80..\xbf, but \xa0..\xbf.
   *
   * @li 2 - for code points U+10000..U+3FFFF. For these code points four bytes
   * are needed for encoding. If the first byte value is \xf0 (11110000), then
   * allowed values for the second byte are not \x80..\xbf, but \x90..\xbf.
   *
   * @li 3 - for code points U+100000..U+10FFFF. If the first byte value
   * is \xf4, then allowed values for the second byte are not \x80..\xbf,
   * but \x80..\x8f.
   */
  unsigned char cc;
#endif
} embedjson_lexer;
/**
 * Type of a JSON token that does not carry a value.
 *
 * The enumerators are numbered consecutively from 0 in declaration order.
 */
typedef enum {
  EMBEDJSON_TOKEN_OPEN_CURLY_BRACKET = 0,  /**< open curly bracket */
  EMBEDJSON_TOKEN_CLOSE_CURLY_BRACKET = 1, /**< close curly bracket */
  EMBEDJSON_TOKEN_OPEN_BRACKET = 2,        /**< open bracket */
  EMBEDJSON_TOKEN_CLOSE_BRACKET = 3,       /**< close bracket */
  EMBEDJSON_TOKEN_COMMA = 4,               /**< comma */
  EMBEDJSON_TOKEN_COLON = 5,               /**< colon */
  EMBEDJSON_TOKEN_TRUE = 6,                /**< literal true */
  EMBEDJSON_TOKEN_FALSE = 7,               /**< literal false */
  EMBEDJSON_TOKEN_NULL = 8                 /**< literal null */
} embedjson_tok;
/**
 * Feeds the next chunk of input data to the lexer.
 *
 * Called by embedjson_push for each data chunk to parse.
 *
 * Results are reported by calling a family of embedjson_token*
 * functions:
 * - embedjson_token
 * - embedjson_tokenc
 * - embedjson_tokenc_begin
 * - embedjson_tokenc_end
 * - embedjson_tokenf
 * - embedjson_tokeni
 *
 * Errors that occur during parsing are reported via an embedjson_error call.
 *
 * @param lexer lexer to feed the data to
 * @param data  pointer to the data chunk to parse
 * @param size  size of the data chunk
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @note If an error occurs, the lexer state remains unchanged.
 */
EMBEDJSON_STATIC int embedjson_lexer_push(embedjson_lexer* lexer,
const char* data, embedjson_size_t size);
/**
 * Called by embedjson_finalize to indicate that all data has been submitted
 * to the lexer.
 *
 * Results are reported in the same way as in the embedjson_lexer_push
 * function.
 *
 * @param lexer lexer that has received all of its input
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 */
EMBEDJSON_STATIC int embedjson_lexer_finalize(embedjson_lexer* lexer);
/**
 * Called from embedjson_lexer_push for each successfully parsed token
 * that does not have a value.
 *
 * The token types that may be reported are:
 * - EMBEDJSON_TOKEN_OPEN_CURLY_BRACKET,
 * - EMBEDJSON_TOKEN_CLOSE_CURLY_BRACKET,
 * - EMBEDJSON_TOKEN_OPEN_BRACKET,
 * - EMBEDJSON_TOKEN_CLOSE_BRACKET,
 * - EMBEDJSON_TOKEN_COMMA,
 * - EMBEDJSON_TOKEN_COLON,
 * - EMBEDJSON_TOKEN_TRUE,
 * - EMBEDJSON_TOKEN_FALSE,
 * - EMBEDJSON_TOKEN_NULL
 *
 * @param lexer    lexer that produced the token
 * @param token    type of the parsed token
 * @param position NOTE(review): presumably the location of the token in the
 * data passed to embedjson_lexer_push - confirm against the implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 */
EMBEDJSON_STATIC int embedjson_token(embedjson_lexer* lexer,
embedjson_tok token, const char* position);
/**
 * Called from embedjson_lexer_push for each successfully parsed
 * string chunk.
 *
 * A pointer to the buffer that contains the string chunk data and its size
 * are provided to the callback.
 *
 * @param lexer lexer that produced the chunk
 * @param data  pointer to the buffer that contains the string chunk data
 * @param size  size of the string chunk
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenc_begin, embedjson_tokenc_end
 */
EMBEDJSON_STATIC int embedjson_tokenc(embedjson_lexer* lexer, const char* data,
embedjson_size_t size);
/**
 * Called from embedjson_lexer_push for each successfully parsed
 * integer value.
 *
 * @param lexer    lexer that produced the value
 * @param value    the parsed integer value
 * @param position NOTE(review): presumably the location of the token in the
 * data passed to embedjson_lexer_push - confirm against the implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenf
 */
EMBEDJSON_STATIC int embedjson_tokeni(embedjson_lexer* lexer, embedjson_int_t value,
const char* position);
/**
 * Called from embedjson_lexer_push for each successfully parsed
 * floating-point value.
 *
 * @param lexer    lexer that produced the value
 * @param value    the parsed floating-point value
 * @param position NOTE(review): presumably the location of the token in the
 * data passed to embedjson_lexer_push - confirm against the implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokeni
 */
EMBEDJSON_STATIC int embedjson_tokenf(embedjson_lexer* lexer, double value,
const char* position);
/**
 * Called from embedjson_lexer_push when the beginning of a string token is
 * spotted.
 *
 * @param lexer    lexer that spotted the string
 * @param position NOTE(review): presumably the location of the beginning of
 * the string in the data passed to embedjson_lexer_push - confirm against
 * the implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenc, embedjson_tokenc_end
 */
EMBEDJSON_STATIC int embedjson_tokenc_begin(embedjson_lexer* lexer,
const char* position);
/**
 * Called from embedjson_lexer_push when string parsing is complete.
 *
 * From the user's perspective, a sequence of embedjson_tokenc calls
 * always ends with a single embedjson_tokenc_end call.
 * The call indicates that all chunks of the string have been parsed.
 *
 * @param lexer    lexer that finished parsing the string
 * @param position NOTE(review): presumably the location of the end of the
 * string in the data passed to embedjson_lexer_push - confirm against the
 * implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenc, embedjson_tokenc_begin
 */
EMBEDJSON_STATIC int embedjson_tokenc_end(embedjson_lexer* lexer,
const char* position);
#if EMBEDJSON_BIGNUM
/**
 * Called from embedjson_lexer_push when the beginning of a big number
 * token is spotted.
 *
 * Declared only when EMBEDJSON_BIGNUM is enabled.
 *
 * @param lexer         lexer that spotted the big number
 * @param position      NOTE(review): presumably the location of the token in
 * the data passed to embedjson_lexer_push - confirm against the
 * implementation.
 * @param initial_value NOTE(review): presumably the part of the number that
 * had already been accumulated as an integer - confirm against the
 * implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenbn, embedjson_tokenbn_end
 */
EMBEDJSON_STATIC int embedjson_tokenbn_begin(embedjson_lexer* lexer,
const char* position, embedjson_int_t initial_value);
/**
 * Called from embedjson_lexer_push for each successfully parsed
 * big number chunk.
 *
 * A pointer to the buffer that contains the big number chunk data and its
 * size are provided to the callback.
 *
 * Declared only when EMBEDJSON_BIGNUM is enabled.
 *
 * @param lexer lexer that produced the chunk
 * @param data  pointer to the buffer that contains the big number chunk data
 * @param size  size of the big number chunk
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenbn_begin, embedjson_tokenbn_end
 */
EMBEDJSON_STATIC int embedjson_tokenbn(embedjson_lexer* lexer, const char* data,
embedjson_size_t size);
/**
 * Called from embedjson_lexer_push when big number parsing is complete.
 *
 * From the user's perspective, a sequence of embedjson_tokenbn calls
 * always ends with a single embedjson_tokenbn_end call.
 * The call indicates that all chunks of the big number have been parsed.
 *
 * Declared only when EMBEDJSON_BIGNUM is enabled.
 *
 * @param lexer    lexer that finished parsing the big number
 * @param position NOTE(review): presumably the location of the end of the
 * big number in the data passed to embedjson_lexer_push - confirm against
 * the implementation.
 * @return status code. NOTE(review): the meaning of the returned values is
 * not stated in this header - confirm against the implementation.
 *
 * @see embedjson_tokenbn, embedjson_tokenbn_begin
 */
EMBEDJSON_STATIC int embedjson_tokenbn_end(embedjson_lexer* lexer,
const char* position);
#endif /* EMBEDJSON_BIGNUM */