Cozis

String Parsing Patterns

Over the years, I came up with several patterns for parsing strings without a tokenization step. I found that following this patterns consciously made it easy to write robust string parsing code.

In this post I'll list them and show how to use them to make a simplified JSON parser. Code snippets will use the C language, but the logic applies to any C-like language.

General Structure

Lets assume char *src is a string of length int len we want to extract information from. We will use the int cur cursor variable to keep track of how many characters we already inspected from the string. The cursor is such that src[cur] is the next character that hasn't been inspected yet. If cur == len, all characters have been inspected. Note that I sometimes refer to inspecting characters as "skipping" or "consuming" them.

Consuming an Optional Sequence

To move the cursor over a sequence of characters, you write a loop like this:

1while (cur < len && test(src[cur]))
2 cur++;
3

the sequence is defined by the test function. All and only characters which pass the test are consumed. If no characters pass the test, the cursor is left unchanged. This is why I call this "consuming an optional sequence".

Consuming a Required Sequence

What if we want to consume a non optional sequence? In other words, the sequence must contain at least one character. The way this is done is by adding a first if statement:

1if (cur == len || !test(src[cur]))
2 return; // Error
3do
4 cur++;
5while (cur < len && test(src[cur]));
6

we added an if statement which checks the negated version of the loop's condition. If the if's check passes, we know there is at least one character, and therefore can use a do..while instead of a regular while loop.

Consuming a Specific Character

We can consume an optional character like this:

1if (cur < len && src[cur] == 'X')
2 cur++;
3

and we can consume a required character like this:

1if (cur == len || src[cur] != 'X')
2 return; // Error
3cur++;
4

Consuming a Specific String

To consume a specific string, we can of course consume the single characters:

1if (cur == len || src[cur] != 'A')
2 return; // Error
3cur++;
4
5if (cur == len || src[cur] != 'B')
6 return; // Error
7cur++;
8
9if (cur == len || src[cur] != 'C')
10 return; // Error
11cur++;
12

but we can do better:

1if (2 >= len - cur
2 || src[cur+0] != 'A'
3 || src[cur+1] != 'B'
4 || src[cur+2] != 'C')
5 return; // Error
6cur += 3;
7

This way we perform a single bounds check and it's more compact overall. This isn't very intuitive for me either, so this is what I think when writing this.

First, I ignore the string's bounds and check the characters:

1if (
2 || src[cur+0] != 'A'
3 || src[cur+1] != 'B'
4 || src[cur+2] != 'C')
5 return; // Error
6cur += 3;
7

the index furthest away from the cursor is cur+2. If this offset is out of bounds, they all are. Therefore the checks are valid only if cur+2 < len, and they are surely not valid when cur+2 >= len. We add this to our expression:

1if (cur+2 >= len
2 || src[cur+0] != 'A'
3 || src[cur+1] != 'B'
4 || src[cur+2] != 'C')
5 return; // Error
6cur += 3;
7

if we want to be extra careful here we can improve the bounds check to avoid overflow

1if (2 >= len - cur
2 || src[cur+0] != 'A'
3 || src[cur+1] != 'B'
4 || src[cur+2] != 'C')
5 return; // Error
6cur += 3;
7

this is because len - cur can never underflow, while cur+2 may overflow. This is very nitpicky though.

Parsing Strings

What if we want to not only consume characters, but extract information from the source string? Fear not! We can easily use our previous lessons for this. Lets say we want to parse a string in the form <spaces> <name> <spaces> <nickname>:

1int off;
2
3while (cur < len && is_whitespace(src[cur]))
4 cur++;
5
6off = cur;
7while (cur < len && !is_whitespace(src[cur]))
8 cur++;
9char *name_ptr = src + off;
10int name_len = cur - off;
11
12while (cur < len && is_whitespace(src[cur]))
13 cur++;
14
15off = cur;
16while (cur < len && !is_whitespace(src[cur]))
17 cur++;
18char *nick_ptr = src + off;
19int nick_len = cur - off;
20
21while (cur < len && is_whitespace(src[cur]))
22 cur++;
23

with this, we end up with pointers to name and nickname and their relative lengths, with no memory allocation! Note that the strings are not null-terminated. If you want to print them with printf, you'll have to use this format specifier:

1printf("Hello, I'm %.*s and my nickname is %.*s\n", name_len, name_ptr, nick_len, nick_ptr);
2

but note that using !is_whitespace(src[cur]) (with the negation) is inherently unsafe. The robust solution is to always specify which characters are allowed, not the other way around. The robust version should use something like this instead of just allowing non-whitespace characters:

1char is_name_or_nick(char c)
2{
3 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
4}
5

Parsing Integers

This is how we can scan a sequence of digits and build the equivalent integer representation of it:

1#include <limits.h> // Defines INT_MAX
2
3typedef struct {
4 char *src;
5 int len;
6 int cur;
7} Scanner;
8
9int is_digit(char c)
10{
11 return c >= '0' && c <= '9';
12}
13
14int parse_int(Scanner *s, int *out)
15{
16 if (s->cur == s->len || !is_digit(s->src[s->cur]))
17 return 0; // Error
18
19 int x = 0;
20 do {
21 int d = s->src[s->cur++] - '0';
22 if (x > (INT_MAX - d) / 10)
23 return 0; // Overflow
24 x = x * 10 + d;
25 } while (s->cur < s->len && is_digit(s->src[s->cur]));
26
27 *out = x;
28 return 1;
29}
30

Parsing Floats

This is how one can parse floats:

1typedef struct {
2 char *src;
3 int len;
4 int cur;
5} Scanner;
6
7int is_digit(char c)
8{
9 return c >= '0' && c <= '9';
10}
11
12int parse_float(Scanner *s, float *out)
13{
14 if (s->cur == s->len || !is_digit(s->src[s->cur]))
15 return 0; // Error
16
17 float x = 0;
18 do {
19 int d = s->src[s->cur++] - '0';
20 x = x * 10 + d;
21 } while (s->cur < s->len && is_digit(s->src[s->cur]));
22
23 if (s->cur == s->len || s->src[s->cur] != '.')
24 return 0; // Error
25 s->cur++;
26
27 if (s->cur == s->len || !is_digit(s->src[s->cur]))
28 return 0; // Error
29 float q = 1;
30 do {
31 q /= 10;
32 x += q * (s->src[s->cur++] - '0');
33 } while (s->cur < s->len && is_digit(s->src[s->cur]));
34
35 *out = x;
36 return 1;
37}
38

this code looks quite involved, but actually it's just the same pattern applied over and over.

Example

This is an example JSON parser written using the previous patterns. For simplicity I just output the parsed values instead of building an in-memory representation.

1#define ASSERT(X) {if (!(X)) __builtin_trap();}
2
3typedef struct {
4 char *src;
5 int len;
6 int cur;
7} Scanner;
8
9int is_digit(char c)
10{
11 return c >= '0' && c <= '9';
12}
13
14int is_whitespace(char c)
15{
16 return c == ' ' || c == '\t' || c == '\r' || c == '\n';
17}
18
19int parse_string(Scanner *s)
20{
21 ASSERT(s->cur < s->len && s->src[s->cur] == '"');
22 s->cur++;
23
24 int off = s->cur;
25 while (s->cur < s->len && s->src[s->cur] != '"')
26 s->cur++;
27 char *p = s->src + off;
28 int l = s->cur - off;
29
30 if (s->cur == s->len)
31 return 0; // Error
32 s->cur++;
33
34 printf("Parsed string \"%.*s\"\n", l, p);
35
36 return 1;
37}
38
39int parse_object(Scanner *s)
40{
41 ASSERT(s->cur < s->len && s->src[s->cur] == '{');
42 s->cur++;
43
44 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
45 s->cur++;
46
47 if (s->cur < s->len && s->src[s->cur] != '}') {
48
49 for (;;) {
50
51 if (!parse_string(s))
52 return 0; // Propagate error
53
54 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
55 s->cur++;
56
57 if (s->cur == s->len || s->src[s->cur] != ':')
58 return 0; // Error
59 s->cur++;
60
61 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
62 s->cur++;
63
64 if (!parse_value(s))
65 return 0; // Propagate error
66
67 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
68 s->cur++;
69
70 if (s->cur < s->len && s->src[s->cur] == '}')
71 break;
72
73 if (s->cur == s->len || s->src[s->cur] != ',')
74 return 0; // Error
75 s->cur++;
76
77 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
78 s->cur++;
79 }
80 }
81 ASSERT(s->cur < s->len && s->src[s->cur] == '}');
82 s->cur++;
83 return 1;
84}
85
86int parse_array(Scanner *s)
87{
88 ASSERT(s->cur < s->len && s->src[s->cur] == '[');
89 s->cur++;
90
91 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
92 s->cur++;
93
94 if (s->cur < s->len && s->src[s->cur] != ']') {
95
96 for (;;) {
97
98 if (!parse_value(s))
99 return 0; // Propagate error
100
101 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
102 s->cur++;
103
104 if (s->cur < s->len && s->src[s->cur] == ']')
105 break;
106
107 if (s->cur == s->len || s->src[s->cur] != ',')
108 return 0; // Error
109 s->cur++;
110
111 while (s->cur < s->len && is_whitespace(s->src[s->cur]))
112 s->cur++;
113 }
114 }
115 ASSERT(s->cur < s->len && s->src[s->cur] == ']');
116 s->cur++;
117 return 1;
118}
119
120int parse_int(Scanner *s)
121{
122 if (s->cur == s->len || !is_digit(s->src[s->cur]))
123 return 0; // Error
124
125 int x = 0;
126 do {
127 int d = s->src[s->cur++] - '0';
128 if (x > (INT_MAX - d) / 10)
129 return 0; // Overflow
130 x = x * 10 + d;
131 } while (s->cur < s->len && is_digit(s->src[s->cur]));
132
133 printf("Parsed integer %d\n", x);
134 return 1;
135}
136
137int parse_float(Scanner *s)
138{
139 if (s->cur == s->len || !is_digit(s->src[s->cur]))
140 return 0; // Error
141
142 float x = 0;
143 do {
144 int d = s->src[s->cur++] - '0';
145 x = x * 10 + d;
146 // Note that parse_number already made sure
147 // the sequence of digits terminated with a dot,
148 // so we can omit the cur<len check
149 } while (s->src[s->cur] != '.');
150
151 if (s->cur == s->len || s->src[s->cur] != '.')
152 return 0; // Error
153 s->cur++;
154
155 if (s->cur == s->len || !is_digit(s->src[s->cur]))
156 return 0; // Error
157 float q = 1;
158 do {
159 q /= 10;
160 x += q * (s->src[s->cur++] - '0');
161 } while (s->cur < s->len && is_digit(s->src[s->cur]));
162
163 printf("Parsed float %f\n", x);
164 return 1;
165}
166
167int parse_number(Scanner *s)
168{
169 int peek = s->cur;
170 while (peek < s->len && is_digit(s->src[peek]))
171 peek++;
172
173 if (peek == s->len || s->src[peek] != '.')
174 return parse_int(s);
175
176 return parse_float(s);
177}
178
179// Parses any type of value. The cursor must point to the first
180// character of the value. Any preceding whitespace must be
181// consume by the caller.
182int parse_value(Scanner *s)
183{
184 if (s->cur == s->len)
185 return 0; // Error
186
187 if (s->src[s->cur] == '"')
188 return parse_string(s);
189
190 if (s->src[s->cur] == '{')
191 return parse_object(s);
192
193 if (s->src[s->cur] == '[')
194 return parse_array(s);
195
196 if (is_digit(s->src[s->cur]))
197 return parse_number(s);
198
199 if (3 < s->len - s->cur
200 && s->src[s->cur+0] == 't'
201 && s->src[s->cur+1] == 'r'
202 && s->src[s->cur+2] == 'u'
203 && s->src[s->cur+3] == 'e') {
204 s->cur += 4;
205 printf("Parsed true\n");
206 return 1;
207 }
208
209 if (3 < s->len - s->cur
210 && s->src[s->cur+0] == 'n'
211 && s->src[s->cur+1] == 'u'
212 && s->src[s->cur+2] == 'l'
213 && s->src[s->cur+3] == 'l') {
214 s->cur += 4;
215 printf("Parsed null\n");
216 return 1;
217 }
218
219 if (4 < s->len - s->cur
220 && s->src[s->cur+0] == 'f'
221 && s->src[s->cur+1] == 'a'
222 && s->src[s->cur+2] == 'l'
223 && s->src[s->cur+3] == 's'
224 && s->src[s->cur+4] == 'e') {
225 s->cur += 5;
226 printf("Parsed false\n");
227 return 1;
228 }
229
230 // Invalid character
231 return 0;
232}
233
234// Returns 0 on error, 1 on success
235int parse_json(char *src, int len)
236{
237 Scanner s = {src, len, 0};
238
239 while (s.cur < s.len && is_whitespace(s.src[s.cur]))
240 s.cur++;
241
242 if (!parse_value(&s))
243 return 0;
244
245 if (s.cur == s.len)
246 return 0;
247
248 return 1;
249}
250