]>
Commit | Line | Data |
---|---|---|
1 | /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc. | |
2 | * | |
3 | * This program is free software; you can redistribute it and/or modify | |
4 | * it under the terms of the GNU Library General Public License as published by | |
5 | * the Free Software Foundation; either version 2, or (at your option) | |
6 | * any later version. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, | |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 | * GNU Library General Public License for more details. | |
12 | * | |
13 | * You should have received a copy of the GNU Library General Public License | |
14 | * along with this software; see the file COPYING. If not, write to | |
15 | * the Free Software Foundation, 59 Temple Place - Suite 330, | |
16 | * Boston, MA 02111-1307, USA. | |
17 | */ | |
18 | ||
19 | \f | |
20 | #include <sys/types.h> | |
21 | #include "rxall.h" | |
22 | #include "rxgnucomp.h" | |
23 | #include "inst-rxposix.h" | |
24 | ||
25 | \f | |
26 | /* {A Syntax Table} | |
27 | */ | |
28 | ||
29 | /* Define the syntax basics for \<, \>, etc. | |
30 | */ | |
31 | ||
32 | #ifndef emacs | |
33 | #define CHARBITS 8 | |
34 | #define CHAR_SET_SIZE (1 << CHARBITS) | |
35 | #define Sword 1 | |
36 | #define SYNTAX(c) re_syntax_table[c] | |
37 | char re_syntax_table[CHAR_SET_SIZE]; | |
38 | ||
39 | #ifdef __STDC__ | |
40 | static void | |
41 | init_syntax_once (void) | |
42 | #else | |
43 | static void | |
44 | init_syntax_once () | |
45 | #endif | |
46 | { | |
47 | register int c; | |
48 | static int done = 0; | |
49 | ||
50 | if (done) | |
51 | return; | |
52 | ||
53 | rx_bzero ((char *)re_syntax_table, sizeof re_syntax_table); | |
54 | ||
55 | for (c = 'a'; c <= 'z'; c++) | |
56 | re_syntax_table[c] = Sword; | |
57 | ||
58 | for (c = 'A'; c <= 'Z'; c++) | |
59 | re_syntax_table[c] = Sword; | |
60 | ||
61 | for (c = '0'; c <= '9'; c++) | |
62 | re_syntax_table[c] = Sword; | |
63 | ||
64 | re_syntax_table['_'] = Sword; | |
65 | ||
66 | done = 1; | |
67 | } | |
68 | #endif /* not emacs */ | |
69 | ||
70 | \f | |
71 | ||
72 | ||
73 | ||
74 | const char *rx_error_msg[] = | |
75 | { | |
76 | 0, /* REG_NOUT */ | |
77 | "No match", /* REG_NOMATCH */ | |
78 | "Invalid regular expression", /* REG_BADPAT */ | |
79 | "Invalid collation character", /* REG_ECOLLATE */ | |
80 | "Invalid character class name", /* REG_ECTYPE */ | |
81 | "Trailing backslash", /* REG_EESCAPE */ | |
82 | "Invalid back reference", /* REG_ESUBREG */ | |
83 | "Unmatched [ or [^", /* REG_EBRACK */ | |
84 | "Unmatched ( or \\(", /* REG_EPAREN */ | |
85 | "Unmatched \\{", /* REG_EBRACE */ | |
86 | "Invalid content of \\{\\}", /* REG_BADBR */ | |
87 | "Invalid range end", /* REG_ERANGE */ | |
88 | "Memory exhausted", /* REG_ESPACE */ | |
89 | "Invalid preceding regular expression", /* REG_BADRPT */ | |
90 | "Premature end of regular expression", /* REG_EEND */ | |
91 | "Regular expression too big", /* REG_ESIZE */ | |
92 | "Unmatched ) or \\)", /* REG_ERPAREN */ | |
93 | }; | |
94 | ||
95 | ||
96 | \f | |
97 | /* | |
98 | * Macros used while compiling patterns. | |
99 | * | |
100 | * By convention, PEND points just past the end of the uncompiled pattern, | |
101 | * P points to the read position in the pattern. `translate' is the name | |
102 | * of the translation table (`TRANSLATE' is the name of a macro that looks | |
103 | * things up in `translate'). | |
104 | */ | |
105 | ||
106 | ||
107 | /* | |
108 | * Fetch the next character in the uncompiled pattern---translating it | |
109 | * if necessary. *Also cast from a signed character in the constant | |
110 | * string passed to us by the user to an unsigned char that we can use | |
111 | * as an array index (in, e.g., `translate'). | |
112 | */ | |
113 | #define PATFETCH(c) \ | |
114 | do {if (p == pend) return REG_EEND; \ | |
115 | c = (unsigned char) *p++; \ | |
116 | c = translate[c]; \ | |
117 | } while (0) | |
118 | ||
119 | /* | |
120 | * Fetch the next character in the uncompiled pattern, with no | |
121 | * translation. | |
122 | */ | |
123 | #define PATFETCH_RAW(c) \ | |
124 | do {if (p == pend) return REG_EEND; \ | |
125 | c = (unsigned char) *p++; \ | |
126 | } while (0) | |
127 | ||
128 | /* Go backwards one character in the pattern. */ | |
129 | #define PATUNFETCH p-- | |
130 | ||
131 | ||
132 | #define TRANSLATE(d) translate[(unsigned char) (d)] | |
133 | ||
134 | typedef int regnum_t; | |
135 | ||
136 | /* Since offsets can go either forwards or backwards, this type needs to | |
137 | * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. | |
138 | */ | |
139 | typedef int pattern_offset_t; | |
140 | ||
141 | typedef struct | |
142 | { | |
143 | struct rexp_node ** top_expression; | |
144 | struct rexp_node ** last_expression; | |
145 | struct rexp_node ** last_non_regular_expression; | |
146 | pattern_offset_t inner_group_offset; | |
147 | regnum_t regnum; | |
148 | } compile_stack_elt_t; | |
149 | ||
150 | typedef struct | |
151 | { | |
152 | compile_stack_elt_t *stack; | |
153 | unsigned size; | |
154 | unsigned avail; /* Offset of next open position. */ | |
155 | } compile_stack_type; | |
156 | ||
157 | ||
158 | #define INIT_COMPILE_STACK_SIZE 32 | |
159 | ||
160 | #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) | |
161 | #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | |
162 | ||
163 | /* The next available element. */ | |
164 | #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | |
165 | ||
166 | ||
167 | /* Set the bit for character C in a list. */ | |
168 | #define SET_LIST_BIT(c) \ | |
169 | (b[((unsigned char) (c)) / CHARBITS] \ | |
170 | |= 1 << (((unsigned char) c) % CHARBITS)) | |
171 | ||
172 | /* Get the next unsigned number in the uncompiled pattern. */ | |
173 | #define GET_UNSIGNED_NUMBER(num) \ | |
174 | { if (p != pend) \ | |
175 | { \ | |
176 | PATFETCH (c); \ | |
177 | while (isdigit (c)) \ | |
178 | { \ | |
179 | if (num < 0) \ | |
180 | num = 0; \ | |
181 | num = num * 10 + c - '0'; \ | |
182 | if (p == pend) \ | |
183 | break; \ | |
184 | PATFETCH (c); \ | |
185 | } \ | |
186 | } \ | |
187 | } | |
188 | ||
189 | #define CHAR_CLASS_MAX_LENGTH 64 | |
190 | ||
191 | #define IS_CHAR_CLASS(string) \ | |
192 | (!strcmp (string, "alpha") || !strcmp (string, "upper") \ | |
193 | || !strcmp (string, "lower") || !strcmp (string, "digit") \ | |
194 | || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \ | |
195 | || !strcmp (string, "space") || !strcmp (string, "print") \ | |
196 | || !strcmp (string, "punct") || !strcmp (string, "graph") \ | |
197 | || !strcmp (string, "cntrl") || !strcmp (string, "blank")) | |
198 | ||
199 | \f | |
200 | /* These predicates are used in regex_compile. */ | |
201 | ||
202 | /* P points to just after a ^ in PATTERN. Return true if that ^ comes | |
203 | * after an alternative or a begin-subexpression. We assume there is at | |
204 | * least one character before the ^. | |
205 | */ | |
206 | ||
207 | #ifdef __STDC__ | |
208 | static int | |
209 | at_begline_loc_p (const char *pattern, const char * p, unsigned long syntax) | |
210 | #else | |
211 | static int | |
212 | at_begline_loc_p (pattern, p, syntax) | |
213 | const char *pattern; | |
214 | const char * p; | |
215 | unsigned long syntax; | |
216 | #endif | |
217 | { | |
218 | const char *prev = p - 2; | |
219 | int prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\')); | |
220 | ||
221 | return | |
222 | ||
223 | (/* After a subexpression? */ | |
224 | ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash)) | |
225 | || | |
226 | /* After an alternative? */ | |
227 | ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash)) | |
228 | ); | |
229 | } | |
230 | ||
231 | /* The dual of at_begline_loc_p. This one is for $. We assume there is | |
232 | * at least one character after the $, i.e., `P < PEND'. | |
233 | */ | |
234 | ||
235 | #ifdef __STDC__ | |
236 | static int | |
237 | at_endline_loc_p (const char *p, const char *pend, int syntax) | |
238 | #else | |
239 | static int | |
240 | at_endline_loc_p (p, pend, syntax) | |
241 | const char *p; | |
242 | const char *pend; | |
243 | int syntax; | |
244 | #endif | |
245 | { | |
246 | const char *next = p; | |
247 | int next_backslash = (*next == '\\'); | |
248 | const char *next_next = (p + 1 < pend) ? (p + 1) : 0; | |
249 | ||
250 | return | |
251 | ( | |
252 | /* Before a subexpression? */ | |
253 | ((syntax & RE_NO_BK_PARENS) | |
254 | ? (*next == ')') | |
255 | : (next_backslash && next_next && (*next_next == ')'))) | |
256 | || | |
257 | /* Before an alternative? */ | |
258 | ((syntax & RE_NO_BK_VBAR) | |
259 | ? (*next == '|') | |
260 | : (next_backslash && next_next && (*next_next == '|'))) | |
261 | ); | |
262 | } | |
263 | \f | |
264 | ||
265 | unsigned char rx_id_translation[256] = | |
266 | { | |
267 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, | |
268 | 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, | |
269 | 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, | |
270 | 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, | |
271 | 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, | |
272 | 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, | |
273 | 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, | |
274 | 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, | |
275 | 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, | |
276 | 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, | |
277 | ||
278 | 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, | |
279 | 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, | |
280 | 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, | |
281 | 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, | |
282 | 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, | |
283 | 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, | |
284 | 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, | |
285 | 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, | |
286 | 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, | |
287 | 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, | |
288 | ||
289 | 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, | |
290 | 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, | |
291 | 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, | |
292 | 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, | |
293 | 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, | |
294 | 250, 251, 252, 253, 254, 255 | |
295 | }; | |
296 | ||
297 | /* The compiler keeps an inverted translation table. | |
298 | * This looks up/inititalize elements. | |
299 | * VALID is an array of booleans that validate CACHE. | |
300 | */ | |
301 | ||
302 | #ifdef __STDC__ | |
303 | static rx_Bitset | |
304 | inverse_translation (int * n_members, int cset_size, char * valid, rx_Bitset cache, unsigned char * translate, int c) | |
305 | #else | |
306 | static rx_Bitset | |
307 | inverse_translation (n_members, cset_size, valid, cache, translate, c) | |
308 | int * n_members; | |
309 | int cset_size; | |
310 | char * valid; | |
311 | rx_Bitset cache; | |
312 | unsigned char * translate; | |
313 | int c; | |
314 | #endif | |
315 | { | |
316 | rx_Bitset cs; | |
317 | cs = cache + c * rx_bitset_numb_subsets (cset_size); | |
318 | ||
319 | if (!valid[c]) | |
320 | { | |
321 | int x; | |
322 | int c_tr; | |
323 | int membs; | |
324 | ||
325 | c_tr = TRANSLATE(c); | |
326 | rx_bitset_null (cset_size, cs); | |
327 | membs = 0; | |
328 | for (x = 0; x < 256; ++x) | |
329 | if (TRANSLATE(x) == c_tr) | |
330 | { | |
331 | RX_bitset_enjoin (cs, x); | |
332 | membs++; | |
333 | } | |
334 | valid[c] = 1; | |
335 | n_members[c] = membs; | |
336 | } | |
337 | return cs; | |
338 | } | |
339 | ||
340 | \f | |
341 | ||
342 | ||
343 | /* More subroutine declarations and macros for regex_compile. */ | |
344 | ||
345 | /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | |
346 | * false if it's not. | |
347 | */ | |
348 | ||
349 | #ifdef __STDC__ | |
350 | static int | |
351 | group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) | |
352 | #else | |
353 | static int | |
354 | group_in_compile_stack (compile_stack, regnum) | |
355 | compile_stack_type compile_stack; | |
356 | regnum_t regnum; | |
357 | #endif | |
358 | { | |
359 | int this_element; | |
360 | ||
361 | for (this_element = compile_stack.avail - 1; | |
362 | this_element >= 0; | |
363 | this_element--) | |
364 | if (compile_stack.stack[this_element].regnum == regnum) | |
365 | return 1; | |
366 | ||
367 | return 0; | |
368 | } | |
369 | ||
370 | ||
371 | /* | |
372 | * Read the ending character of a range (in a bracket expression) from the | |
373 | * uncompiled pattern *P_PTR (which ends at PEND). We assume the | |
374 | * starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | |
375 | * Then we set the translation of all bits between the starting and | |
376 | * ending characters (inclusive) in the compiled pattern B. | |
377 | * | |
378 | * Return an error code. | |
379 | * | |
380 | * We use these short variable names so we can use the same macros as | |
381 | * `regex_compile' itself. | |
382 | */ | |
383 | ||
384 | #ifdef __STDC__ | |
385 | static int | |
386 | compile_range (int * n_members, int cset_size, rx_Bitset cs, const char ** p_ptr, const char * pend, unsigned char * translate, unsigned long syntax, rx_Bitset inv_tr, char * valid_inv_tr) | |
387 | #else | |
388 | static int | |
389 | compile_range (n_members, cset_size, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr) | |
390 | int * n_members; | |
391 | int cset_size; | |
392 | rx_Bitset cs; | |
393 | const char ** p_ptr; | |
394 | const char * pend; | |
395 | unsigned char * translate; | |
396 | unsigned long syntax; | |
397 | rx_Bitset inv_tr; | |
398 | char * valid_inv_tr; | |
399 | #endif | |
400 | { | |
401 | unsigned this_char; | |
402 | ||
403 | const char *p = *p_ptr; | |
404 | ||
405 | unsigned char range_end; | |
406 | unsigned char range_start = TRANSLATE(p[-2]); | |
407 | ||
408 | if (p == pend) | |
409 | return REG_ERANGE; | |
410 | ||
411 | PATFETCH (range_end); | |
412 | ||
413 | (*p_ptr)++; | |
414 | ||
415 | if (range_start > range_end) | |
416 | return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
417 | ||
418 | for (this_char = range_start; this_char <= range_end; this_char++) | |
419 | { | |
420 | rx_Bitset it = | |
421 | inverse_translation (n_members, cset_size, valid_inv_tr, inv_tr, translate, this_char); | |
422 | rx_bitset_union (cset_size, cs, it); | |
423 | } | |
424 | ||
425 | return REG_NOERROR; | |
426 | } | |
427 | \f | |
428 | ||
429 | #ifdef __STDC__ | |
430 | static int | |
431 | pointless_if_repeated (struct rexp_node * node) | |
432 | #else | |
433 | static int | |
434 | pointless_if_repeated (node) | |
435 | struct rexp_node * node; | |
436 | #endif | |
437 | { | |
438 | if (!node) | |
439 | return 1; | |
440 | switch (node->type) | |
441 | { | |
442 | case r_cset: | |
443 | case r_string: | |
444 | case r_cut: | |
445 | return 0; | |
446 | case r_concat: | |
447 | case r_alternate: | |
448 | return (pointless_if_repeated (node->params.pair.left) | |
449 | && pointless_if_repeated (node->params.pair.right)); | |
450 | case r_opt: | |
451 | case r_star: | |
452 | case r_interval: | |
453 | case r_parens: | |
454 | return pointless_if_repeated (node->params.pair.left); | |
455 | case r_context: | |
456 | switch (node->params.intval) | |
457 | { | |
458 | case '=': | |
459 | case '<': | |
460 | case '^': | |
461 | case 'b': | |
462 | case 'B': | |
463 | case '`': | |
464 | case '\'': | |
465 | return 1; | |
466 | default: | |
467 | return 0; | |
468 | } | |
469 | default: | |
470 | return 0; | |
471 | } | |
472 | } | |
473 | ||
474 | \f | |
475 | ||
476 | #ifdef __STDC__ | |
477 | static int | |
478 | factor_string (struct rexp_node *** lastp, int cset_size) | |
479 | #else | |
480 | static int | |
481 | factor_string (lastp, cset_size) | |
482 | struct rexp_node *** lastp; | |
483 | int cset_size; | |
484 | #endif | |
485 | { | |
486 | struct rexp_node ** expp; | |
487 | struct rexp_node * exp; | |
488 | rx_Bitset cs; | |
489 | struct rexp_node * cset_node; | |
490 | ||
491 | expp = *lastp; | |
492 | exp = *expp; /* presumed r_string */ | |
493 | ||
494 | cs = rx_cset (cset_size); | |
495 | if (!cs) | |
496 | return -1; | |
497 | RX_bitset_enjoin (cs, exp->params.cstr.contents[exp->params.cstr.len - 1]); | |
498 | cset_node = rx_mk_r_cset (r_cset, cset_size, cs); | |
499 | if (!cset_node) | |
500 | { | |
501 | rx_free_cset (cs); | |
502 | return -1; | |
503 | } | |
504 | if (exp->params.cstr.len == 1) | |
505 | { | |
506 | rx_free_rexp (exp); | |
507 | *expp = cset_node; | |
508 | /* lastp remains the same */ | |
509 | return 0; | |
510 | } | |
511 | else | |
512 | { | |
513 | struct rexp_node * concat_node; | |
514 | exp->params.cstr.len--; | |
515 | concat_node = rx_mk_r_binop (r_concat, exp, cset_node); | |
516 | if (!concat_node) | |
517 | { | |
518 | rx_free_rexp (cset_node); | |
519 | return -1; | |
520 | } | |
521 | *expp = concat_node; | |
522 | *lastp = &concat_node->params.pair.right; | |
523 | return 0; | |
524 | } | |
525 | } | |
526 | ||
527 | \f | |
528 | ||
529 | #define isa_blank(C) (((C) == ' ') || ((C) == '\t')) | |
530 | ||
531 | #ifdef __STDC__ | |
532 | int | |
533 | rx_parse (struct rexp_node ** rexp_p, | |
534 | const char *pattern, | |
535 | int size, | |
536 | unsigned long syntax, | |
537 | int cset_size, | |
538 | unsigned char *translate) | |
539 | #else | |
540 | int | |
541 | rx_parse (rexp_p, pattern, size, syntax, cset_size, translate) | |
542 | struct rexp_node ** rexp_p; | |
543 | const char *pattern; | |
544 | int size; | |
545 | unsigned long syntax; | |
546 | int cset_size; | |
547 | unsigned char *translate; | |
548 | #endif | |
549 | { | |
550 | int compile_error; | |
551 | RX_subset | |
552 | inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)]; | |
553 | char validate_inv_tr [CHAR_SET_SIZE]; | |
554 | int n_members [CHAR_SET_SIZE]; | |
555 | ||
556 | /* We fetch characters from PATTERN here. Even though PATTERN is | |
557 | * `char *' (i.e., signed), we declare these variables as unsigned, so | |
558 | * they can be reliably used as array indices. | |
559 | */ | |
560 | register unsigned char c; | |
561 | register unsigned char c1; | |
562 | ||
563 | /* A random tempory spot in PATTERN. */ | |
564 | const char *p1; | |
565 | ||
566 | /* Keeps track of unclosed groups. */ | |
567 | compile_stack_type compile_stack; | |
568 | ||
569 | /* Points to the current (ending) position in the pattern. */ | |
570 | const char *p; | |
571 | const char *pend; | |
572 | ||
573 | /* When parsing is done, this will hold the expression tree. */ | |
574 | struct rexp_node * rexp; | |
575 | ||
576 | /* This and top_expression are saved on the compile stack. */ | |
577 | struct rexp_node ** top_expression; | |
578 | struct rexp_node ** last_non_regular_expression; | |
579 | struct rexp_node ** last_expression; | |
580 | ||
581 | /* Parameter to `goto append_node' */ | |
582 | struct rexp_node * append; | |
583 | ||
584 | /* Counts open-groups as they are encountered. This is the index of the | |
585 | * innermost group being compiled. | |
586 | */ | |
587 | regnum_t regnum; | |
588 | ||
589 | /* True iff the sub-expression just started | |
590 | * is purely syntactic. Otherwise, a regmatch data | |
591 | * slot is allocated for the subexpression. | |
592 | */ | |
593 | int syntax_only_parens; | |
594 | ||
595 | /* Place in the uncompiled pattern (i.e., the {) to | |
596 | * which to go back if the interval is invalid. | |
597 | */ | |
598 | const char *beg_interval; | |
599 | ||
600 | int side; | |
601 | ||
602 | \f | |
603 | ||
604 | if (!translate) | |
605 | translate = rx_id_translation; | |
606 | ||
607 | /* Points to the current (ending) position in the pattern. */ | |
608 | p = pattern; | |
609 | pend = pattern + size; | |
610 | ||
611 | /* When parsing is done, this will hold the expression tree. */ | |
612 | rexp = 0; | |
613 | ||
614 | /* In the midst of compilation, this holds onto the regexp | |
615 | * first parst while rexp goes on to aquire additional constructs. | |
616 | */ | |
617 | top_expression = &rexp; | |
618 | last_non_regular_expression = top_expression; | |
619 | last_expression = top_expression; | |
620 | ||
621 | /* Counts open-groups as they are encountered. This is the index of the | |
622 | * innermost group being compiled. | |
623 | */ | |
624 | regnum = 0; | |
625 | ||
626 | rx_bzero ((char *)validate_inv_tr, sizeof (validate_inv_tr)); | |
627 | ||
628 | ||
629 | /* Initialize the compile stack. */ | |
630 | compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t))); | |
631 | if (compile_stack.stack == 0) | |
632 | return REG_ESPACE; | |
633 | ||
634 | compile_stack.size = INIT_COMPILE_STACK_SIZE; | |
635 | compile_stack.avail = 0; | |
636 | ||
637 | #if !defined (emacs) && !defined (SYNTAX_TABLE) | |
638 | /* Initialize the syntax table. */ | |
639 | init_syntax_once (); | |
640 | #endif | |
641 | ||
642 | /* Loop through the uncompiled pattern until we're at the end. */ | |
643 | while (p != pend) | |
644 | { | |
645 | PATFETCH (c); | |
646 | ||
647 | switch (c) | |
648 | { | |
649 | case '^': | |
650 | { | |
651 | if ( /* If at start of pattern, it's an operator. */ | |
652 | p == pattern + 1 | |
653 | /* If context independent, it's an operator. */ | |
654 | || syntax & RE_CONTEXT_INDEP_ANCHORS | |
655 | /* Otherwise, depends on what's come before. */ | |
656 | || at_begline_loc_p (pattern, p, syntax)) | |
657 | { | |
658 | struct rexp_node * n | |
659 | = rx_mk_r_int (r_context, '^'); | |
660 | if (!n) | |
661 | goto space_error; | |
662 | append = n; | |
663 | goto append_node; | |
664 | } | |
665 | else | |
666 | goto normal_char; | |
667 | } | |
668 | break; | |
669 | ||
670 | ||
671 | case '$': | |
672 | { | |
673 | if ( /* If at end of pattern, it's an operator. */ | |
674 | p == pend | |
675 | /* If context independent, it's an operator. */ | |
676 | || syntax & RE_CONTEXT_INDEP_ANCHORS | |
677 | /* Otherwise, depends on what's next. */ | |
678 | || at_endline_loc_p (p, pend, syntax)) | |
679 | { | |
680 | struct rexp_node * n | |
681 | = rx_mk_r_int (r_context, '$'); | |
682 | if (!n) | |
683 | goto space_error; | |
684 | append = n; | |
685 | goto append_node; | |
686 | } | |
687 | else | |
688 | goto normal_char; | |
689 | } | |
690 | break; | |
691 | ||
692 | ||
693 | case '+': | |
694 | case '?': | |
695 | if ((syntax & RE_BK_PLUS_QM) | |
696 | || (syntax & RE_LIMITED_OPS)) | |
697 | goto normal_char; | |
698 | ||
699 | handle_plus: | |
700 | case '*': | |
701 | /* If there is no previous pattern... */ | |
702 | if (pointless_if_repeated (*last_expression)) | |
703 | { | |
704 | if (syntax & RE_CONTEXT_INVALID_OPS) | |
705 | { | |
706 | compile_error = REG_BADRPT; | |
707 | goto error_return; | |
708 | } | |
709 | else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
710 | goto normal_char; | |
711 | } | |
712 | ||
713 | { | |
714 | /* 1 means zero (many) matches is allowed. */ | |
715 | char zero_times_ok = 0, many_times_ok = 0; | |
716 | ||
717 | /* If there is a sequence of repetition chars, collapse it | |
718 | down to just one (the right one). We can't combine | |
719 | interval operators with these because of, e.g., `a{2}*', | |
720 | which should only match an even number of `a's. */ | |
721 | ||
722 | for (;;) | |
723 | { | |
724 | zero_times_ok |= c != '+'; | |
725 | many_times_ok |= c != '?'; | |
726 | ||
727 | if (p == pend) | |
728 | break; | |
729 | ||
730 | PATFETCH (c); | |
731 | ||
732 | if (c == '*' | |
733 | || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) | |
734 | ; | |
735 | ||
736 | else if (syntax & RE_BK_PLUS_QM && c == '\\') | |
737 | { | |
738 | if (p == pend) | |
739 | { | |
740 | compile_error = REG_EESCAPE; | |
741 | goto error_return; | |
742 | } | |
743 | ||
744 | PATFETCH (c1); | |
745 | if (!(c1 == '+' || c1 == '?')) | |
746 | { | |
747 | PATUNFETCH; | |
748 | PATUNFETCH; | |
749 | break; | |
750 | } | |
751 | ||
752 | c = c1; | |
753 | } | |
754 | else | |
755 | { | |
756 | PATUNFETCH; | |
757 | break; | |
758 | } | |
759 | ||
760 | /* If we get here, we found another repeat character. */ | |
761 | } | |
762 | ||
763 | /* Now we know whether or not zero matches is allowed | |
764 | * and also whether or not two or more matches is allowed. | |
765 | */ | |
766 | ||
767 | { | |
768 | struct rexp_node * inner_exp; | |
769 | struct rexp_node * star; | |
770 | ||
771 | if (*last_expression && ((*last_expression)->type == r_string)) | |
772 | if (factor_string (&last_expression, cset_size)) | |
773 | goto space_error; | |
774 | inner_exp = *last_expression; | |
775 | star = rx_mk_r_monop ((many_times_ok | |
776 | ? (zero_times_ok ? r_star : r_plus) | |
777 | : r_opt), | |
778 | inner_exp); | |
779 | if (!star) | |
780 | goto space_error; | |
781 | *last_expression = star; | |
782 | } | |
783 | } | |
784 | break; | |
785 | ||
786 | ||
787 | case '.': | |
788 | { | |
789 | rx_Bitset cs; | |
790 | struct rexp_node * n; | |
791 | cs = rx_cset (cset_size); | |
792 | if (!cs) | |
793 | goto space_error; | |
794 | n = rx_mk_r_cset (r_cset, cset_size, cs); | |
795 | if (!n) | |
796 | { | |
797 | rx_free_cset (cs); | |
798 | goto space_error; | |
799 | } | |
800 | rx_bitset_universe (cset_size, cs); | |
801 | if (!(syntax & RE_DOT_NEWLINE)) | |
802 | RX_bitset_remove (cs, '\n'); | |
803 | if (syntax & RE_DOT_NOT_NULL) | |
804 | RX_bitset_remove (cs, 0); | |
805 | ||
806 | append = n; | |
807 | goto append_node; | |
808 | break; | |
809 | } | |
810 | ||
811 | ||
812 | case '[': | |
813 | if (p == pend) | |
814 | { | |
815 | compile_error = REG_EBRACK; | |
816 | goto error_return; | |
817 | } | |
818 | { | |
819 | int had_char_class; | |
820 | rx_Bitset cs; | |
821 | struct rexp_node * node; | |
822 | int is_inverted; | |
823 | ||
824 | had_char_class = 0; | |
825 | is_inverted = *p == '^'; | |
826 | cs = rx_cset (cset_size); | |
827 | if (!cs) | |
828 | goto space_error; | |
829 | node = rx_mk_r_cset (r_cset, cset_size ,cs); | |
830 | if (!node) | |
831 | { | |
832 | rx_free_cset (cs); | |
833 | goto space_error; | |
834 | } | |
835 | ||
836 | /* This branch of the switch is normally exited with | |
837 | *`goto append_node' | |
838 | */ | |
839 | append = node; | |
840 | ||
841 | if (is_inverted) | |
842 | p++; | |
843 | ||
844 | /* Remember the first position in the bracket expression. */ | |
845 | p1 = p; | |
846 | ||
847 | /* Read in characters and ranges, setting map bits. */ | |
848 | for (;;) | |
849 | { | |
850 | if (p == pend) | |
851 | { | |
852 | compile_error = REG_EBRACK; | |
853 | goto error_return; | |
854 | } | |
855 | ||
856 | PATFETCH (c); | |
857 | ||
858 | /* \ might escape characters inside [...] and [^...]. */ | |
859 | if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
860 | { | |
861 | if (p == pend) | |
862 | { | |
863 | compile_error = REG_EESCAPE; | |
864 | goto error_return; | |
865 | } | |
866 | ||
867 | PATFETCH (c1); | |
868 | { | |
869 | rx_Bitset it = inverse_translation (n_members, | |
870 | cset_size, | |
871 | validate_inv_tr, | |
872 | inverse_translate, | |
873 | translate, | |
874 | c1); | |
875 | rx_bitset_union (cset_size, cs, it); | |
876 | } | |
877 | continue; | |
878 | } | |
879 | ||
880 | /* Could be the end of the bracket expression. If it's | |
881 | not (i.e., when the bracket expression is `[]' so | |
882 | far), the ']' character bit gets set way below. */ | |
883 | if (c == ']' && p != p1 + 1) | |
884 | goto finalize_class_and_append; | |
885 | ||
886 | /* Look ahead to see if it's a range when the last thing | |
887 | was a character class. */ | |
888 | if (had_char_class && c == '-' && *p != ']') | |
889 | { | |
890 | compile_error = REG_ERANGE; | |
891 | goto error_return; | |
892 | } | |
893 | ||
894 | /* Look ahead to see if it's a range when the last thing | |
895 | was a character: if this is a hyphen not at the | |
896 | beginning or the end of a list, then it's the range | |
897 | operator. */ | |
898 | if (c == '-' | |
899 | && !(p - 2 >= pattern && p[-2] == '[') | |
900 | && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') | |
901 | && *p != ']') | |
902 | { | |
903 | int ret | |
904 | = compile_range (n_members, cset_size, cs, &p, pend, translate, syntax, | |
905 | inverse_translate, validate_inv_tr); | |
906 | if (ret != REG_NOERROR) | |
907 | { | |
908 | compile_error = ret; | |
909 | goto error_return; | |
910 | } | |
911 | } | |
912 | ||
913 | else if (p[0] == '-' && p[1] != ']') | |
914 | { /* This handles ranges made up of characters only. */ | |
915 | int ret; | |
916 | ||
917 | /* Move past the `-'. */ | |
918 | PATFETCH (c1); | |
919 | ||
920 | ret = compile_range (n_members, cset_size, cs, &p, pend, translate, syntax, | |
921 | inverse_translate, validate_inv_tr); | |
922 | if (ret != REG_NOERROR) | |
923 | { | |
924 | compile_error = ret; | |
925 | goto error_return; | |
926 | } | |
927 | } | |
928 | ||
929 | /* See if we're at the beginning of a possible character | |
930 | class. */ | |
931 | ||
932 | else if ((syntax & RE_CHAR_CLASSES) | |
933 | && (c == '[') && (*p == ':')) | |
934 | { | |
935 | char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
936 | ||
937 | PATFETCH (c); | |
938 | c1 = 0; | |
939 | ||
940 | /* If pattern is `[[:'. */ | |
941 | if (p == pend) | |
942 | { | |
943 | compile_error = REG_EBRACK; | |
944 | goto error_return; | |
945 | } | |
946 | ||
947 | for (;;) | |
948 | { | |
949 | PATFETCH (c); | |
950 | if (c == ':' || c == ']' || p == pend | |
951 | || c1 == CHAR_CLASS_MAX_LENGTH) | |
952 | break; | |
953 | str[c1++] = c; | |
954 | } | |
955 | str[c1] = '\0'; | |
956 | ||
957 | /* If isn't a word bracketed by `[:' and:`]': | |
958 | undo the ending character, the letters, and leave | |
959 | the leading `:' and `[' (but set bits for them). */ | |
960 | if (c == ':' && *p == ']') | |
961 | { | |
962 | if (!strncmp (str, "cut", 3)) | |
963 | { | |
964 | int val; | |
965 | if (1 != sscanf (str + 3, " %d", &val)) | |
966 | { | |
967 | compile_error = REG_ECTYPE; | |
968 | goto error_return; | |
969 | } | |
970 | /* Throw away the ]] */ | |
971 | PATFETCH (c); | |
972 | PATFETCH (c); | |
973 | { | |
974 | struct rexp_node * cut; | |
975 | cut = rx_mk_r_int (r_cut, val); | |
976 | append = cut; | |
977 | goto append_node; | |
978 | } | |
979 | } | |
980 | else if (!strncmp (str, "(", 1)) | |
981 | { | |
982 | /* Throw away the ]] */ | |
983 | PATFETCH (c); | |
984 | PATFETCH (c); | |
985 | syntax_only_parens = 1; | |
986 | goto handle_open; | |
987 | } | |
988 | else if (!strncmp (str, ")", 1)) | |
989 | { | |
990 | /* Throw away the ]] */ | |
991 | PATFETCH (c); | |
992 | PATFETCH (c); | |
993 | syntax_only_parens = 1; | |
994 | goto handle_close; | |
995 | } | |
996 | else | |
997 | { | |
998 | int ch; | |
999 | int is_alnum = !strcmp (str, "alnum"); | |
1000 | int is_alpha = !strcmp (str, "alpha"); | |
1001 | int is_blank = !strcmp (str, "blank"); | |
1002 | int is_cntrl = !strcmp (str, "cntrl"); | |
1003 | int is_digit = !strcmp (str, "digit"); | |
1004 | int is_graph = !strcmp (str, "graph"); | |
1005 | int is_lower = !strcmp (str, "lower"); | |
1006 | int is_print = !strcmp (str, "print"); | |
1007 | int is_punct = !strcmp (str, "punct"); | |
1008 | int is_space = !strcmp (str, "space"); | |
1009 | int is_upper = !strcmp (str, "upper"); | |
1010 | int is_xdigit = !strcmp (str, "xdigit"); | |
1011 | ||
1012 | if (!IS_CHAR_CLASS (str)) | |
1013 | { | |
1014 | compile_error = REG_ECTYPE; | |
1015 | goto error_return; | |
1016 | } | |
1017 | ||
1018 | /* Throw away the ] at the end of the character | |
1019 | class. */ | |
1020 | PATFETCH (c); | |
1021 | ||
1022 | if (p == pend) { compile_error = REG_EBRACK; goto error_return; } | |
1023 | ||
1024 | for (ch = 0; ch < 1 << CHARBITS; ch++) | |
1025 | { | |
1026 | if ( (is_alnum && isalnum (ch)) | |
1027 | || (is_alpha && isalpha (ch)) | |
1028 | || (is_blank && isa_blank (ch)) | |
1029 | || (is_cntrl && iscntrl (ch)) | |
1030 | || (is_digit && isdigit (ch)) | |
1031 | || (is_graph && isgraph (ch)) | |
1032 | || (is_lower && islower (ch)) | |
1033 | || (is_print && isprint (ch)) | |
1034 | || (is_punct && ispunct (ch)) | |
1035 | || (is_space && isspace (ch)) | |
1036 | || (is_upper && isupper (ch)) | |
1037 | || (is_xdigit && isxdigit (ch))) | |
1038 | { | |
1039 | rx_Bitset it = | |
1040 | inverse_translation (n_members, | |
1041 | cset_size, | |
1042 | validate_inv_tr, | |
1043 | inverse_translate, | |
1044 | translate, | |
1045 | ch); | |
1046 | rx_bitset_union (cset_size, | |
1047 | cs, it); | |
1048 | } | |
1049 | } | |
1050 | had_char_class = 1; | |
1051 | } | |
1052 | } | |
1053 | else | |
1054 | { | |
1055 | c1++; | |
1056 | while (c1--) | |
1057 | PATUNFETCH; | |
1058 | { | |
1059 | rx_Bitset it = | |
1060 | inverse_translation (n_members, | |
1061 | cset_size, | |
1062 | validate_inv_tr, | |
1063 | inverse_translate, | |
1064 | translate, | |
1065 | '['); | |
1066 | rx_bitset_union (cset_size, | |
1067 | cs, it); | |
1068 | } | |
1069 | { | |
1070 | rx_Bitset it = | |
1071 | inverse_translation (n_members, | |
1072 | cset_size, | |
1073 | validate_inv_tr, | |
1074 | inverse_translate, | |
1075 | translate, | |
1076 | ':'); | |
1077 | rx_bitset_union (cset_size, | |
1078 | cs, it); | |
1079 | } | |
1080 | had_char_class = 0; | |
1081 | } | |
1082 | } | |
1083 | else | |
1084 | { | |
1085 | had_char_class = 0; | |
1086 | { | |
1087 | rx_Bitset it = inverse_translation (n_members, | |
1088 | cset_size, | |
1089 | validate_inv_tr, | |
1090 | inverse_translate, | |
1091 | translate, | |
1092 | c); | |
1093 | rx_bitset_union (cset_size, cs, it); | |
1094 | } | |
1095 | } | |
1096 | } | |
1097 | ||
1098 | finalize_class_and_append: | |
1099 | if (is_inverted) | |
1100 | { | |
1101 | rx_bitset_complement (cset_size, cs); | |
1102 | if (syntax & RE_HAT_LISTS_NOT_NEWLINE) | |
1103 | RX_bitset_remove (cs, '\n'); | |
1104 | } | |
1105 | goto append_node; | |
1106 | } | |
1107 | break; | |
1108 | ||
1109 | ||
1110 | case '(': | |
1111 | if (syntax & RE_NO_BK_PARENS) | |
1112 | { | |
1113 | syntax_only_parens = 0; | |
1114 | goto handle_open; | |
1115 | } | |
1116 | else | |
1117 | goto normal_char; | |
1118 | ||
1119 | ||
1120 | case ')': | |
1121 | if (syntax & RE_NO_BK_PARENS) | |
1122 | { | |
1123 | syntax_only_parens = 0; | |
1124 | goto handle_close; | |
1125 | } | |
1126 | else | |
1127 | goto normal_char; | |
1128 | ||
1129 | ||
1130 | case '\n': | |
1131 | if (syntax & RE_NEWLINE_ALT) | |
1132 | goto handle_alt; | |
1133 | else | |
1134 | goto normal_char; | |
1135 | ||
1136 | ||
1137 | case '|': | |
1138 | if (syntax & RE_NO_BK_VBAR) | |
1139 | goto handle_alt; | |
1140 | else | |
1141 | goto normal_char; | |
1142 | ||
1143 | ||
1144 | case '{': | |
1145 | if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
1146 | goto handle_interval; | |
1147 | else | |
1148 | goto normal_char; | |
1149 | ||
1150 | ||
1151 | case '\\': | |
1152 | if (p == pend) { compile_error = REG_EESCAPE; goto error_return; } | |
1153 | ||
1154 | /* Do not translate the character after the \, so that we can | |
1155 | distinguish, e.g., \B from \b, even if we normally would | |
1156 | translate, e.g., B to b. */ | |
1157 | PATFETCH_RAW (c); | |
1158 | ||
1159 | switch (c) | |
1160 | { | |
1161 | case '(': | |
1162 | if (syntax & RE_NO_BK_PARENS) | |
1163 | goto normal_backslash; | |
1164 | ||
1165 | syntax_only_parens = 0; | |
1166 | ||
1167 | handle_open: | |
1168 | if (!syntax_only_parens) | |
1169 | regnum++; | |
1170 | ||
1171 | if (COMPILE_STACK_FULL) | |
1172 | { | |
1173 | compile_stack.stack | |
1174 | = ((compile_stack_elt_t *) | |
1175 | realloc (compile_stack.stack, | |
1176 | (compile_stack.size << 1) * sizeof (compile_stack_elt_t))); | |
1177 | if (compile_stack.stack == 0) | |
1178 | goto space_error; | |
1179 | compile_stack.size <<= 1; | |
1180 | } | |
1181 | ||
1182 | if (*last_non_regular_expression) | |
1183 | { | |
1184 | struct rexp_node * concat; | |
1185 | concat = rx_mk_r_binop (r_concat, *last_non_regular_expression, 0); | |
1186 | if (!concat) | |
1187 | goto space_error; | |
1188 | *last_non_regular_expression = concat; | |
1189 | last_non_regular_expression = &concat->params.pair.right; | |
1190 | last_expression = last_non_regular_expression; | |
1191 | } | |
1192 | ||
1193 | /* | |
1194 | * These are the values to restore when we hit end of this | |
1195 | * group. | |
1196 | */ | |
1197 | COMPILE_STACK_TOP.top_expression = top_expression; | |
1198 | COMPILE_STACK_TOP.last_expression = last_expression; | |
1199 | COMPILE_STACK_TOP.last_non_regular_expression = last_non_regular_expression; | |
1200 | ||
1201 | if (syntax_only_parens) | |
1202 | COMPILE_STACK_TOP.regnum = -1; | |
1203 | else | |
1204 | COMPILE_STACK_TOP.regnum = regnum; | |
1205 | ||
1206 | compile_stack.avail++; | |
1207 | ||
1208 | top_expression = last_non_regular_expression; | |
1209 | break; | |
1210 | ||
1211 | ||
1212 | case ')': | |
1213 | if (syntax & RE_NO_BK_PARENS) goto normal_backslash; | |
1214 | syntax_only_parens = 0; | |
1215 | ||
1216 | handle_close: | |
1217 | /* See similar code for backslashed left paren above. */ | |
1218 | if (COMPILE_STACK_EMPTY) | |
1219 | if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
1220 | goto normal_char; | |
1221 | else | |
1222 | { compile_error = REG_ERPAREN; goto error_return; } | |
1223 | ||
1224 | /* Since we just checked for an empty stack above, this | |
1225 | * ``can't happen''. | |
1226 | */ | |
1227 | ||
1228 | { | |
1229 | /* We don't just want to restore into `regnum', because | |
1230 | * later groups should continue to be numbered higher, | |
1231 | * as in `(ab)c(de)' -- the second group is #2. | |
1232 | */ | |
1233 | regnum_t this_group_regnum; | |
1234 | struct rexp_node ** inner; | |
1235 | struct rexp_node * parens; | |
1236 | ||
1237 | inner = top_expression; | |
1238 | compile_stack.avail--; | |
1239 | ||
1240 | if (!!syntax_only_parens != (COMPILE_STACK_TOP.regnum == -1)) | |
1241 | { compile_error = REG_ERPAREN; goto error_return; } | |
1242 | ||
1243 | top_expression = COMPILE_STACK_TOP.top_expression; | |
1244 | last_expression = COMPILE_STACK_TOP.last_expression; | |
1245 | last_non_regular_expression = COMPILE_STACK_TOP.last_non_regular_expression; | |
1246 | this_group_regnum = COMPILE_STACK_TOP.regnum; | |
1247 | ||
1248 | { | |
1249 | parens = rx_mk_r_monop (r_parens, *inner); | |
1250 | if (!parens) | |
1251 | goto space_error; | |
1252 | parens->params.intval = this_group_regnum; | |
1253 | *inner = parens; | |
1254 | break; | |
1255 | } | |
1256 | } | |
1257 | ||
1258 | case '|': /* `\|'. */ | |
1259 | if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR)) | |
1260 | goto normal_backslash; | |
1261 | handle_alt: | |
1262 | if (syntax & RE_LIMITED_OPS) | |
1263 | goto normal_char; | |
1264 | ||
1265 | { | |
1266 | struct rexp_node * alt; | |
1267 | ||
1268 | alt = rx_mk_r_binop (r_alternate, *top_expression, 0); | |
1269 | if (!alt) | |
1270 | goto space_error; | |
1271 | *top_expression = alt; | |
1272 | last_expression = &alt->params.pair.right; | |
1273 | last_non_regular_expression = &alt->params.pair.right; | |
1274 | } | |
1275 | break; | |
1276 | ||
1277 | ||
1278 | case '{': | |
1279 | /* If \{ is a literal. */ | |
1280 | if (!(syntax & RE_INTERVALS) | |
1281 | /* If we're at `\{' and it's not the open-interval | |
1282 | operator. */ | |
1283 | || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
1284 | || (p - 2 == pattern && p == pend)) | |
1285 | goto normal_backslash; | |
1286 | ||
1287 | handle_interval: | |
1288 | { | |
1289 | /* If got here, then the syntax allows intervals. | |
1290 | */ | |
1291 | ||
1292 | /* At least (most) this many matches must be made. | |
1293 | */ | |
1294 | int lower_bound; | |
1295 | int upper_bound; | |
1296 | ||
1297 | lower_bound = -1; | |
1298 | upper_bound = -1; | |
1299 | ||
1300 | /* We're about to parse the bounds of the interval. | |
1301 | * It may turn out that this isn't an interval after | |
1302 | * all, in which case these same characters will have | |
1303 | * to be reparsed as literals. This remembers | |
1304 | * the backtrack point in the parse: | |
1305 | */ | |
1306 | beg_interval = p - 1; | |
1307 | ||
1308 | if (p == pend) | |
1309 | { | |
1310 | if (syntax & RE_NO_BK_BRACES) | |
1311 | goto unfetch_interval; | |
1312 | else | |
1313 | { compile_error = REG_EBRACE; goto error_return; } | |
1314 | } | |
1315 | ||
1316 | GET_UNSIGNED_NUMBER (lower_bound); | |
1317 | ||
1318 | if (c == ',') | |
1319 | { | |
1320 | GET_UNSIGNED_NUMBER (upper_bound); | |
1321 | if (upper_bound < 0) upper_bound = RE_DUP_MAX; | |
1322 | } | |
1323 | else | |
1324 | /* Interval such as `{n}' => match exactly n times. | |
1325 | */ | |
1326 | upper_bound = lower_bound; | |
1327 | ||
1328 | if (lower_bound < 0 | |
1329 | || upper_bound > RE_DUP_MAX | |
1330 | || lower_bound > upper_bound) | |
1331 | { | |
1332 | if (syntax & RE_NO_BK_BRACES) | |
1333 | goto unfetch_interval; | |
1334 | else | |
1335 | { compile_error = REG_BADBR; goto error_return; } | |
1336 | } | |
1337 | ||
1338 | if (!(syntax & RE_NO_BK_BRACES)) | |
1339 | { | |
1340 | if (c != '\\') { compile_error = REG_EBRACE; goto error_return; } | |
1341 | PATFETCH (c); | |
1342 | } | |
1343 | ||
1344 | if (c != '}') | |
1345 | { | |
1346 | if (syntax & RE_NO_BK_BRACES) | |
1347 | goto unfetch_interval; | |
1348 | else | |
1349 | { compile_error = REG_BADBR; goto error_return; } | |
1350 | } | |
1351 | ||
1352 | /* We just parsed a valid interval. | |
1353 | * lower_bound and upper_bound are set. | |
1354 | */ | |
1355 | ||
1356 | /* If it's invalid to have no preceding re. | |
1357 | */ | |
1358 | if (pointless_if_repeated (*last_expression)) | |
1359 | { | |
1360 | if (syntax & RE_CONTEXT_INVALID_OPS) | |
1361 | { compile_error = REG_BADRPT; goto error_return; } | |
1362 | else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
1363 | /* treat the interval spec as literal chars. */ | |
1364 | goto unfetch_interval; | |
1365 | } | |
1366 | ||
1367 | { | |
1368 | struct rexp_node * interval; | |
1369 | ||
1370 | if (*last_expression && ((*last_expression)->type == r_string)) | |
1371 | if (factor_string (&last_expression, cset_size)) | |
1372 | goto space_error; | |
1373 | interval = rx_mk_r_monop (r_interval, *last_expression); | |
1374 | if (!interval) | |
1375 | goto space_error; | |
1376 | interval->params.intval = lower_bound; | |
1377 | interval->params.intval2 = upper_bound; | |
1378 | *last_expression = interval; | |
1379 | last_non_regular_expression = last_expression; | |
1380 | } | |
1381 | beg_interval = 0; | |
1382 | } | |
1383 | break; | |
1384 | ||
1385 | unfetch_interval: | |
1386 | /* If an invalid interval, match the characters as literals. */ | |
1387 | p = beg_interval; | |
1388 | beg_interval = 0; | |
1389 | ||
1390 | /* normal_char and normal_backslash need `c'. */ | |
1391 | PATFETCH (c); | |
1392 | ||
1393 | if (!(syntax & RE_NO_BK_BRACES)) | |
1394 | { | |
1395 | if ((p > pattern) && (p[-1] == '\\')) | |
1396 | goto normal_backslash; | |
1397 | } | |
1398 | goto normal_char; | |
1399 | ||
1400 | #ifdef emacs | |
1401 | /* There is no way to specify the before_dot and after_dot | |
1402 | * operators. rms says this is ok. --karl | |
1403 | */ | |
1404 | case '=': | |
1405 | side = '='; | |
1406 | goto add_side_effect; | |
1407 | break; | |
1408 | ||
1409 | case 's': | |
1410 | case 'S': | |
1411 | { | |
1412 | rx_Bitset cs; | |
1413 | struct rexp_node * set; | |
1414 | ||
1415 | cs = rx_cset (&cset_size); | |
1416 | if (!cs) | |
1417 | goto space_error; | |
1418 | set = rx_mk_r_cset (r_cset, cset_size, cs); | |
1419 | if (!set) | |
1420 | { | |
1421 | rx_free_cset (cs); | |
1422 | goto space_error; | |
1423 | } | |
1424 | if (c == 'S') | |
1425 | rx_bitset_universe (cset_size, cs); | |
1426 | ||
1427 | PATFETCH (c); | |
1428 | { | |
1429 | int x; | |
1430 | enum syntaxcode code = syntax_spec_code [c]; | |
1431 | for (x = 0; x < 256; ++x) | |
1432 | { | |
1433 | ||
1434 | if (SYNTAX (x) == code) | |
1435 | { | |
1436 | rx_Bitset it = | |
1437 | inverse_translation (n_members, | |
1438 | cset_size, validate_inv_tr, | |
1439 | inverse_translate, | |
1440 | translate, x); | |
1441 | rx_bitset_xor (cset_size, cs, it); | |
1442 | } | |
1443 | } | |
1444 | } | |
1445 | append = set; | |
1446 | goto append_node; | |
1447 | } | |
1448 | break; | |
1449 | #endif /* emacs */ | |
1450 | ||
1451 | ||
1452 | case 'w': | |
1453 | case 'W': | |
1454 | { | |
1455 | rx_Bitset cs; | |
1456 | struct rexp_node * n; | |
1457 | ||
1458 | cs = rx_cset (cset_size); | |
1459 | n = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0); | |
1460 | if (!(cs && n)) | |
1461 | { | |
1462 | if (cs) | |
1463 | rx_free_cset (cs); | |
1464 | goto space_error; | |
1465 | } | |
1466 | if (c == 'W') | |
1467 | rx_bitset_universe (cset_size ,cs); | |
1468 | { | |
1469 | int x; | |
1470 | for (x = cset_size - 1; x > 0; --x) | |
1471 | if (SYNTAX(x) & Sword) | |
1472 | RX_bitset_toggle (cs, x); | |
1473 | } | |
1474 | append = n; | |
1475 | goto append_node; | |
1476 | } | |
1477 | break; | |
1478 | ||
1479 | case '<': | |
1480 | side = '<'; | |
1481 | goto add_side_effect; | |
1482 | break; | |
1483 | ||
1484 | case '>': | |
1485 | side = '>'; | |
1486 | goto add_side_effect; | |
1487 | break; | |
1488 | ||
1489 | case 'b': | |
1490 | side = 'b'; | |
1491 | goto add_side_effect; | |
1492 | break; | |
1493 | ||
1494 | case 'B': | |
1495 | side = 'B'; | |
1496 | goto add_side_effect; | |
1497 | break; | |
1498 | ||
1499 | case '`': | |
1500 | side = '`'; | |
1501 | goto add_side_effect; | |
1502 | break; | |
1503 | ||
1504 | case '\'': | |
1505 | side = '\''; | |
1506 | goto add_side_effect; | |
1507 | break; | |
1508 | ||
1509 | add_side_effect: | |
1510 | { | |
1511 | struct rexp_node * se; | |
1512 | se = rx_mk_r_int (r_context, side); | |
1513 | if (!se) | |
1514 | goto space_error; | |
1515 | append = se; | |
1516 | goto append_node; | |
1517 | } | |
1518 | break; | |
1519 | ||
1520 | case '1': case '2': case '3': case '4': case '5': | |
1521 | case '6': case '7': case '8': case '9': | |
1522 | if (syntax & RE_NO_BK_REFS) | |
1523 | goto normal_char; | |
1524 | ||
1525 | c1 = c - '0'; | |
1526 | ||
1527 | /* Can't back reference to a subexpression if inside of it. */ | |
1528 | if (group_in_compile_stack (compile_stack, c1)) | |
1529 | goto normal_char; | |
1530 | ||
1531 | if (c1 > regnum) | |
1532 | { compile_error = REG_ESUBREG; goto error_return; } | |
1533 | ||
1534 | side = c; | |
1535 | goto add_side_effect; | |
1536 | break; | |
1537 | ||
1538 | case '+': | |
1539 | case '?': | |
1540 | if (syntax & RE_BK_PLUS_QM) | |
1541 | goto handle_plus; | |
1542 | else | |
1543 | goto normal_backslash; | |
1544 | ||
1545 | default: | |
1546 | normal_backslash: | |
1547 | /* You might think it would be useful for \ to mean | |
1548 | * not to translate; but if we don't translate it | |
1549 | * it will never match anything. | |
1550 | */ | |
1551 | c = TRANSLATE (c); | |
1552 | goto normal_char; | |
1553 | } | |
1554 | break; | |
1555 | ||
1556 | ||
1557 | default: | |
1558 | /* Expects the character in `c'. */ | |
1559 | normal_char: | |
1560 | { | |
1561 | rx_Bitset cs; | |
1562 | struct rexp_node * match; | |
1563 | rx_Bitset it; | |
1564 | ||
1565 | it = inverse_translation (n_members, | |
1566 | cset_size, validate_inv_tr, | |
1567 | inverse_translate, translate, c); | |
1568 | ||
1569 | if (1 != n_members[c]) | |
1570 | { | |
1571 | cs = rx_cset (cset_size); | |
1572 | match = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0); | |
1573 | if (!(cs && match)) | |
1574 | { | |
1575 | if (cs) | |
1576 | rx_free_cset (cs); | |
1577 | goto space_error; | |
1578 | } | |
1579 | rx_bitset_union (CHAR_SET_SIZE, cs, it); | |
1580 | append = match; | |
1581 | goto append_node; | |
1582 | } | |
1583 | else | |
1584 | { | |
1585 | if (*last_expression && (*last_expression)->type == r_string) | |
1586 | { | |
1587 | if (rx_adjoin_string (&((*last_expression)->params.cstr), c)) | |
1588 | goto space_error; | |
1589 | break; | |
1590 | } | |
1591 | else | |
1592 | { | |
1593 | append = rx_mk_r_str (r_string, c); | |
1594 | if(!append) | |
1595 | goto space_error; | |
1596 | goto append_node; | |
1597 | } | |
1598 | } | |
1599 | break; | |
1600 | ||
1601 | append_node: | |
1602 | /* This genericly appends the rexp APPEND to *LAST_EXPRESSION | |
1603 | * and then parses the next character normally. | |
1604 | */ | |
1605 | if (RX_regular_node_type (append->type)) | |
1606 | { | |
1607 | if (!*last_expression) | |
1608 | *last_expression = append; | |
1609 | else | |
1610 | { | |
1611 | struct rexp_node * concat; | |
1612 | concat = rx_mk_r_binop (r_concat, | |
1613 | *last_expression, append); | |
1614 | if (!concat) | |
1615 | goto space_error; | |
1616 | *last_expression = concat; | |
1617 | last_expression = &concat->params.pair.right; | |
1618 | } | |
1619 | } | |
1620 | else | |
1621 | { | |
1622 | if (!*last_non_regular_expression) | |
1623 | { | |
1624 | *last_non_regular_expression = append; | |
1625 | last_expression = last_non_regular_expression; | |
1626 | } | |
1627 | else | |
1628 | { | |
1629 | struct rexp_node * concat; | |
1630 | concat = rx_mk_r_binop (r_concat, | |
1631 | *last_non_regular_expression, append); | |
1632 | if (!concat) | |
1633 | goto space_error; | |
1634 | *last_non_regular_expression = concat; | |
1635 | last_non_regular_expression = &concat->params.pair.right; | |
1636 | last_expression = last_non_regular_expression; | |
1637 | } | |
1638 | } | |
1639 | } | |
1640 | } /* switch (c) */ | |
1641 | } /* while p != pend */ | |
1642 | ||
1643 | ||
1644 | /* Through the pattern now. */ | |
1645 | ||
1646 | if (!COMPILE_STACK_EMPTY) | |
1647 | { compile_error = REG_EPAREN; goto error_return; } | |
1648 | free (compile_stack.stack); | |
1649 | ||
1650 | ||
1651 | *rexp_p = rexp; | |
1652 | return REG_NOERROR; | |
1653 | ||
1654 | space_error: | |
1655 | compile_error = REG_ESPACE; | |
1656 | ||
1657 | error_return: | |
1658 | free (compile_stack.stack); | |
1659 | /* Free expressions pushed onto the compile stack! */ | |
1660 | if (rexp) | |
1661 | rx_free_rexp (rexp); | |
1662 | return compile_error; | |
1663 | } | |
1664 | ||
1665 |