]> jfr.im git - irc/evilnet/x3.git/blame - rx/rxgnucomp.c
Removed extra 'is' from CSMSG_SMURF_TARGET
[irc/evilnet/x3.git] / rx / rxgnucomp.c
CommitLineData
d76ed9a9 1/* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU Library General Public License as published by
5 * the Free Software Foundation; either version 2, or (at your option)
6 * any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Library General Public License for more details.
12 *
13 * You should have received a copy of the GNU Library General Public License
14 * along with this software; see the file COPYING. If not, write to
15 * the Free Software Foundation, 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19\f
20#include <sys/types.h>
21#include "rxall.h"
22#include "rxgnucomp.h"
23#include "inst-rxposix.h"
24
25\f
26/* {A Syntax Table}
27 */
28
29/* Define the syntax basics for \<, \>, etc.
30 */
31
32#ifndef emacs
33#define CHARBITS 8
34#define CHAR_SET_SIZE (1 << CHARBITS)
35#define Sword 1
36#define SYNTAX(c) re_syntax_table[c]
37char re_syntax_table[CHAR_SET_SIZE];
38
39#ifdef __STDC__
40static void
41init_syntax_once (void)
42#else
43static void
44init_syntax_once ()
45#endif
46{
47 register int c;
48 static int done = 0;
49
50 if (done)
51 return;
52
53 rx_bzero ((char *)re_syntax_table, sizeof re_syntax_table);
54
55 for (c = 'a'; c <= 'z'; c++)
56 re_syntax_table[c] = Sword;
57
58 for (c = 'A'; c <= 'Z'; c++)
59 re_syntax_table[c] = Sword;
60
61 for (c = '0'; c <= '9'; c++)
62 re_syntax_table[c] = Sword;
63
64 re_syntax_table['_'] = Sword;
65
66 done = 1;
67}
68#endif /* not emacs */
69
70\f
71
72
73
74const char *rx_error_msg[] =
75{
76 0, /* REG_NOUT */
77 "No match", /* REG_NOMATCH */
78 "Invalid regular expression", /* REG_BADPAT */
79 "Invalid collation character", /* REG_ECOLLATE */
80 "Invalid character class name", /* REG_ECTYPE */
81 "Trailing backslash", /* REG_EESCAPE */
82 "Invalid back reference", /* REG_ESUBREG */
83 "Unmatched [ or [^", /* REG_EBRACK */
84 "Unmatched ( or \\(", /* REG_EPAREN */
85 "Unmatched \\{", /* REG_EBRACE */
86 "Invalid content of \\{\\}", /* REG_BADBR */
87 "Invalid range end", /* REG_ERANGE */
88 "Memory exhausted", /* REG_ESPACE */
89 "Invalid preceding regular expression", /* REG_BADRPT */
90 "Premature end of regular expression", /* REG_EEND */
91 "Regular expression too big", /* REG_ESIZE */
92 "Unmatched ) or \\)", /* REG_ERPAREN */
93};
94
95
96\f
97/*
98 * Macros used while compiling patterns.
99 *
100 * By convention, PEND points just past the end of the uncompiled pattern,
101 * P points to the read position in the pattern. `translate' is the name
102 * of the translation table (`TRANSLATE' is the name of a macro that looks
103 * things up in `translate').
104 */
105
106
107/*
108 * Fetch the next character in the uncompiled pattern---translating it
109 * if necessary. *Also cast from a signed character in the constant
110 * string passed to us by the user to an unsigned char that we can use
111 * as an array index (in, e.g., `translate').
112 */
113#define PATFETCH(c) \
114 do {if (p == pend) return REG_EEND; \
115 c = (unsigned char) *p++; \
116 c = translate[c]; \
117 } while (0)
118
119/*
120 * Fetch the next character in the uncompiled pattern, with no
121 * translation.
122 */
123#define PATFETCH_RAW(c) \
124 do {if (p == pend) return REG_EEND; \
125 c = (unsigned char) *p++; \
126 } while (0)
127
128/* Go backwards one character in the pattern. */
129#define PATUNFETCH p--
130
131
132#define TRANSLATE(d) translate[(unsigned char) (d)]
133
134typedef int regnum_t;
135
136/* Since offsets can go either forwards or backwards, this type needs to
137 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
138 */
139typedef int pattern_offset_t;
140
141typedef struct
142{
143 struct rexp_node ** top_expression;
144 struct rexp_node ** last_expression;
145 struct rexp_node ** last_non_regular_expression;
146 pattern_offset_t inner_group_offset;
147 regnum_t regnum;
148} compile_stack_elt_t;
149
150typedef struct
151{
152 compile_stack_elt_t *stack;
153 unsigned size;
154 unsigned avail; /* Offset of next open position. */
155} compile_stack_type;
156
157
158#define INIT_COMPILE_STACK_SIZE 32
159
160#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
161#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
162
163/* The next available element. */
164#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
165
166
167/* Set the bit for character C in a list. */
168#define SET_LIST_BIT(c) \
169 (b[((unsigned char) (c)) / CHARBITS] \
170 |= 1 << (((unsigned char) c) % CHARBITS))
171
172/* Get the next unsigned number in the uncompiled pattern. */
173#define GET_UNSIGNED_NUMBER(num) \
174 { if (p != pend) \
175 { \
176 PATFETCH (c); \
177 while (isdigit (c)) \
178 { \
179 if (num < 0) \
180 num = 0; \
181 num = num * 10 + c - '0'; \
182 if (p == pend) \
183 break; \
184 PATFETCH (c); \
185 } \
186 } \
187 }
188
189#define CHAR_CLASS_MAX_LENGTH 64
190
191#define IS_CHAR_CLASS(string) \
192 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
193 || !strcmp (string, "lower") || !strcmp (string, "digit") \
194 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
195 || !strcmp (string, "space") || !strcmp (string, "print") \
196 || !strcmp (string, "punct") || !strcmp (string, "graph") \
197 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
198
199\f
200/* These predicates are used in regex_compile. */
201
202/* P points to just after a ^ in PATTERN. Return true if that ^ comes
203 * after an alternative or a begin-subexpression. We assume there is at
204 * least one character before the ^.
205 */
206
207#ifdef __STDC__
208static int
209at_begline_loc_p (const char *pattern, const char * p, unsigned long syntax)
210#else
211static int
212at_begline_loc_p (pattern, p, syntax)
213 const char *pattern;
214 const char * p;
215 unsigned long syntax;
216#endif
217{
218 const char *prev = p - 2;
219 int prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\'));
220
221 return
222
223 (/* After a subexpression? */
224 ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash))
225 ||
226 /* After an alternative? */
227 ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash))
228 );
229}
230
231/* The dual of at_begline_loc_p. This one is for $. We assume there is
232 * at least one character after the $, i.e., `P < PEND'.
233 */
234
235#ifdef __STDC__
236static int
237at_endline_loc_p (const char *p, const char *pend, int syntax)
238#else
239static int
240at_endline_loc_p (p, pend, syntax)
241 const char *p;
242 const char *pend;
243 int syntax;
244#endif
245{
246 const char *next = p;
247 int next_backslash = (*next == '\\');
248 const char *next_next = (p + 1 < pend) ? (p + 1) : 0;
249
250 return
251 (
252 /* Before a subexpression? */
253 ((syntax & RE_NO_BK_PARENS)
254 ? (*next == ')')
255 : (next_backslash && next_next && (*next_next == ')')))
256 ||
257 /* Before an alternative? */
258 ((syntax & RE_NO_BK_VBAR)
259 ? (*next == '|')
260 : (next_backslash && next_next && (*next_next == '|')))
261 );
262}
263\f
264
265unsigned char rx_id_translation[256] =
266{
267 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
268 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
269 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
270 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
271 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
272 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
273 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
274 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
275 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
276 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
277
278 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
279 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
280 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
281 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
282 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
283 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
284 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
285 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
286 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
287 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
288
289 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
290 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
291 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
292 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
293 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
294 250, 251, 252, 253, 254, 255
295};
296
297/* The compiler keeps an inverted translation table.
298 * This looks up/inititalize elements.
299 * VALID is an array of booleans that validate CACHE.
300 */
301
302#ifdef __STDC__
303static rx_Bitset
304inverse_translation (int * n_members, int cset_size, char * valid, rx_Bitset cache, unsigned char * translate, int c)
305#else
306static rx_Bitset
307inverse_translation (n_members, cset_size, valid, cache, translate, c)
308 int * n_members;
309 int cset_size;
310 char * valid;
311 rx_Bitset cache;
312 unsigned char * translate;
313 int c;
314#endif
315{
316 rx_Bitset cs;
317 cs = cache + c * rx_bitset_numb_subsets (cset_size);
318
319 if (!valid[c])
320 {
321 int x;
322 int c_tr;
323 int membs;
324
325 c_tr = TRANSLATE(c);
326 rx_bitset_null (cset_size, cs);
327 membs = 0;
328 for (x = 0; x < 256; ++x)
329 if (TRANSLATE(x) == c_tr)
330 {
331 RX_bitset_enjoin (cs, x);
332 membs++;
333 }
334 valid[c] = 1;
335 n_members[c] = membs;
336 }
337 return cs;
338}
339
340\f
341
342
343/* More subroutine declarations and macros for regex_compile. */
344
345/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
346 * false if it's not.
347 */
348
349#ifdef __STDC__
350static int
351group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
352#else
353static int
354group_in_compile_stack (compile_stack, regnum)
355 compile_stack_type compile_stack;
356 regnum_t regnum;
357#endif
358{
359 int this_element;
360
361 for (this_element = compile_stack.avail - 1;
362 this_element >= 0;
363 this_element--)
364 if (compile_stack.stack[this_element].regnum == regnum)
365 return 1;
366
367 return 0;
368}
369
370
371/*
372 * Read the ending character of a range (in a bracket expression) from the
373 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
374 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
375 * Then we set the translation of all bits between the starting and
376 * ending characters (inclusive) in the compiled pattern B.
377 *
378 * Return an error code.
379 *
380 * We use these short variable names so we can use the same macros as
381 * `regex_compile' itself.
382 */
383
384#ifdef __STDC__
385static int
386compile_range (int * n_members, int cset_size, rx_Bitset cs, const char ** p_ptr, const char * pend, unsigned char * translate, unsigned long syntax, rx_Bitset inv_tr, char * valid_inv_tr)
387#else
388static int
389compile_range (n_members, cset_size, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr)
390 int * n_members;
391 int cset_size;
392 rx_Bitset cs;
393 const char ** p_ptr;
394 const char * pend;
395 unsigned char * translate;
396 unsigned long syntax;
397 rx_Bitset inv_tr;
398 char * valid_inv_tr;
399#endif
400{
401 unsigned this_char;
402
403 const char *p = *p_ptr;
404
405 unsigned char range_end;
406 unsigned char range_start = TRANSLATE(p[-2]);
407
408 if (p == pend)
409 return REG_ERANGE;
410
411 PATFETCH (range_end);
412
413 (*p_ptr)++;
414
415 if (range_start > range_end)
416 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
417
418 for (this_char = range_start; this_char <= range_end; this_char++)
419 {
420 rx_Bitset it =
421 inverse_translation (n_members, cset_size, valid_inv_tr, inv_tr, translate, this_char);
422 rx_bitset_union (cset_size, cs, it);
423 }
424
425 return REG_NOERROR;
426}
427\f
428
429#ifdef __STDC__
430static int
431pointless_if_repeated (struct rexp_node * node)
432#else
433static int
434pointless_if_repeated (node)
435 struct rexp_node * node;
436#endif
437{
438 if (!node)
439 return 1;
440 switch (node->type)
441 {
442 case r_cset:
443 case r_string:
444 case r_cut:
445 return 0;
446 case r_concat:
447 case r_alternate:
448 return (pointless_if_repeated (node->params.pair.left)
449 && pointless_if_repeated (node->params.pair.right));
450 case r_opt:
451 case r_star:
452 case r_interval:
453 case r_parens:
454 return pointless_if_repeated (node->params.pair.left);
455 case r_context:
456 switch (node->params.intval)
457 {
458 case '=':
459 case '<':
460 case '^':
461 case 'b':
462 case 'B':
463 case '`':
464 case '\'':
465 return 1;
466 default:
467 return 0;
468 }
469 default:
470 return 0;
471 }
472}
473
474\f
475
476#ifdef __STDC__
477static int
478factor_string (struct rexp_node *** lastp, int cset_size)
479#else
480static int
481factor_string (lastp, cset_size)
482 struct rexp_node *** lastp;
483 int cset_size;
484#endif
485{
486 struct rexp_node ** expp;
487 struct rexp_node * exp;
488 rx_Bitset cs;
489 struct rexp_node * cset_node;
490
491 expp = *lastp;
492 exp = *expp; /* presumed r_string */
493
494 cs = rx_cset (cset_size);
495 if (!cs)
496 return -1;
497 RX_bitset_enjoin (cs, exp->params.cstr.contents[exp->params.cstr.len - 1]);
498 cset_node = rx_mk_r_cset (r_cset, cset_size, cs);
499 if (!cset_node)
500 {
501 rx_free_cset (cs);
502 return -1;
503 }
504 if (exp->params.cstr.len == 1)
505 {
506 rx_free_rexp (exp);
507 *expp = cset_node;
508 /* lastp remains the same */
509 return 0;
510 }
511 else
512 {
513 struct rexp_node * concat_node;
514 exp->params.cstr.len--;
515 concat_node = rx_mk_r_binop (r_concat, exp, cset_node);
516 if (!concat_node)
517 {
518 rx_free_rexp (cset_node);
519 return -1;
520 }
521 *expp = concat_node;
522 *lastp = &concat_node->params.pair.right;
523 return 0;
524 }
525}
526
527\f
528
529#define isa_blank(C) (((C) == ' ') || ((C) == '\t'))
530
531#ifdef __STDC__
532int
533rx_parse (struct rexp_node ** rexp_p,
534 const char *pattern,
535 int size,
536 unsigned long syntax,
537 int cset_size,
538 unsigned char *translate)
539#else
540int
541rx_parse (rexp_p, pattern, size, syntax, cset_size, translate)
542 struct rexp_node ** rexp_p;
543 const char *pattern;
544 int size;
545 unsigned long syntax;
546 int cset_size;
547 unsigned char *translate;
548#endif
549{
550 int compile_error;
551 RX_subset
552 inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
553 char validate_inv_tr [CHAR_SET_SIZE];
554 int n_members [CHAR_SET_SIZE];
555
556 /* We fetch characters from PATTERN here. Even though PATTERN is
557 * `char *' (i.e., signed), we declare these variables as unsigned, so
558 * they can be reliably used as array indices.
559 */
560 register unsigned char c;
561 register unsigned char c1;
562
563 /* A random tempory spot in PATTERN. */
564 const char *p1;
565
566 /* Keeps track of unclosed groups. */
567 compile_stack_type compile_stack;
568
569 /* Points to the current (ending) position in the pattern. */
570 const char *p;
571 const char *pend;
572
573 /* When parsing is done, this will hold the expression tree. */
574 struct rexp_node * rexp;
575
576 /* This and top_expression are saved on the compile stack. */
577 struct rexp_node ** top_expression;
578 struct rexp_node ** last_non_regular_expression;
579 struct rexp_node ** last_expression;
580
581 /* Parameter to `goto append_node' */
582 struct rexp_node * append;
583
584 /* Counts open-groups as they are encountered. This is the index of the
585 * innermost group being compiled.
586 */
587 regnum_t regnum;
588
589 /* True iff the sub-expression just started
590 * is purely syntactic. Otherwise, a regmatch data
591 * slot is allocated for the subexpression.
592 */
593 int syntax_only_parens;
594
595 /* Place in the uncompiled pattern (i.e., the {) to
596 * which to go back if the interval is invalid.
597 */
598 const char *beg_interval;
599
600 int side;
601
602\f
603
604 if (!translate)
605 translate = rx_id_translation;
606
607 /* Points to the current (ending) position in the pattern. */
608 p = pattern;
609 pend = pattern + size;
610
611 /* When parsing is done, this will hold the expression tree. */
612 rexp = 0;
613
614 /* In the midst of compilation, this holds onto the regexp
615 * first parst while rexp goes on to aquire additional constructs.
616 */
617 top_expression = &rexp;
618 last_non_regular_expression = top_expression;
619 last_expression = top_expression;
620
621 /* Counts open-groups as they are encountered. This is the index of the
622 * innermost group being compiled.
623 */
624 regnum = 0;
625
626 rx_bzero ((char *)validate_inv_tr, sizeof (validate_inv_tr));
627
628
629 /* Initialize the compile stack. */
630 compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t)));
631 if (compile_stack.stack == 0)
632 return REG_ESPACE;
633
634 compile_stack.size = INIT_COMPILE_STACK_SIZE;
635 compile_stack.avail = 0;
636
637#if !defined (emacs) && !defined (SYNTAX_TABLE)
638 /* Initialize the syntax table. */
639 init_syntax_once ();
640#endif
641
642 /* Loop through the uncompiled pattern until we're at the end. */
643 while (p != pend)
644 {
645 PATFETCH (c);
646
647 switch (c)
648 {
649 case '^':
650 {
651 if ( /* If at start of pattern, it's an operator. */
652 p == pattern + 1
653 /* If context independent, it's an operator. */
654 || syntax & RE_CONTEXT_INDEP_ANCHORS
655 /* Otherwise, depends on what's come before. */
656 || at_begline_loc_p (pattern, p, syntax))
657 {
658 struct rexp_node * n
659 = rx_mk_r_int (r_context, '^');
660 if (!n)
661 goto space_error;
662 append = n;
663 goto append_node;
664 }
665 else
666 goto normal_char;
667 }
668 break;
669
670
671 case '$':
672 {
673 if ( /* If at end of pattern, it's an operator. */
674 p == pend
675 /* If context independent, it's an operator. */
676 || syntax & RE_CONTEXT_INDEP_ANCHORS
677 /* Otherwise, depends on what's next. */
678 || at_endline_loc_p (p, pend, syntax))
679 {
680 struct rexp_node * n
681 = rx_mk_r_int (r_context, '$');
682 if (!n)
683 goto space_error;
684 append = n;
685 goto append_node;
686 }
687 else
688 goto normal_char;
689 }
690 break;
691
692
693 case '+':
694 case '?':
695 if ((syntax & RE_BK_PLUS_QM)
696 || (syntax & RE_LIMITED_OPS))
697 goto normal_char;
698
699 handle_plus:
700 case '*':
701 /* If there is no previous pattern... */
702 if (pointless_if_repeated (*last_expression))
703 {
704 if (syntax & RE_CONTEXT_INVALID_OPS)
705 {
706 compile_error = REG_BADRPT;
707 goto error_return;
708 }
709 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
710 goto normal_char;
711 }
712
713 {
714 /* 1 means zero (many) matches is allowed. */
715 char zero_times_ok = 0, many_times_ok = 0;
716
717 /* If there is a sequence of repetition chars, collapse it
718 down to just one (the right one). We can't combine
719 interval operators with these because of, e.g., `a{2}*',
720 which should only match an even number of `a's. */
721
722 for (;;)
723 {
724 zero_times_ok |= c != '+';
725 many_times_ok |= c != '?';
726
727 if (p == pend)
728 break;
729
730 PATFETCH (c);
731
732 if (c == '*'
733 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
734 ;
735
736 else if (syntax & RE_BK_PLUS_QM && c == '\\')
737 {
738 if (p == pend)
739 {
740 compile_error = REG_EESCAPE;
741 goto error_return;
742 }
743
744 PATFETCH (c1);
745 if (!(c1 == '+' || c1 == '?'))
746 {
747 PATUNFETCH;
748 PATUNFETCH;
749 break;
750 }
751
752 c = c1;
753 }
754 else
755 {
756 PATUNFETCH;
757 break;
758 }
759
760 /* If we get here, we found another repeat character. */
761 }
762
763 /* Now we know whether or not zero matches is allowed
764 * and also whether or not two or more matches is allowed.
765 */
766
767 {
768 struct rexp_node * inner_exp;
769 struct rexp_node * star;
770
771 if (*last_expression && ((*last_expression)->type == r_string))
772 if (factor_string (&last_expression, cset_size))
773 goto space_error;
774 inner_exp = *last_expression;
775 star = rx_mk_r_monop ((many_times_ok
776 ? (zero_times_ok ? r_star : r_plus)
777 : r_opt),
778 inner_exp);
779 if (!star)
780 goto space_error;
781 *last_expression = star;
782 }
783 }
784 break;
785
786
787 case '.':
788 {
789 rx_Bitset cs;
790 struct rexp_node * n;
791 cs = rx_cset (cset_size);
792 if (!cs)
793 goto space_error;
794 n = rx_mk_r_cset (r_cset, cset_size, cs);
795 if (!n)
796 {
797 rx_free_cset (cs);
798 goto space_error;
799 }
800 rx_bitset_universe (cset_size, cs);
801 if (!(syntax & RE_DOT_NEWLINE))
802 RX_bitset_remove (cs, '\n');
803 if (syntax & RE_DOT_NOT_NULL)
804 RX_bitset_remove (cs, 0);
805
806 append = n;
807 goto append_node;
808 break;
809 }
810
811
812 case '[':
813 if (p == pend)
814 {
815 compile_error = REG_EBRACK;
816 goto error_return;
817 }
818 {
819 int had_char_class;
820 rx_Bitset cs;
821 struct rexp_node * node;
822 int is_inverted;
823
824 had_char_class = 0;
825 is_inverted = *p == '^';
826 cs = rx_cset (cset_size);
827 if (!cs)
828 goto space_error;
829 node = rx_mk_r_cset (r_cset, cset_size ,cs);
830 if (!node)
831 {
832 rx_free_cset (cs);
833 goto space_error;
834 }
835
836 /* This branch of the switch is normally exited with
837 *`goto append_node'
838 */
839 append = node;
840
841 if (is_inverted)
842 p++;
843
844 /* Remember the first position in the bracket expression. */
845 p1 = p;
846
847 /* Read in characters and ranges, setting map bits. */
848 for (;;)
849 {
850 if (p == pend)
851 {
852 compile_error = REG_EBRACK;
853 goto error_return;
854 }
855
856 PATFETCH (c);
857
858 /* \ might escape characters inside [...] and [^...]. */
859 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
860 {
861 if (p == pend)
862 {
863 compile_error = REG_EESCAPE;
864 goto error_return;
865 }
866
867 PATFETCH (c1);
868 {
869 rx_Bitset it = inverse_translation (n_members,
870 cset_size,
871 validate_inv_tr,
872 inverse_translate,
873 translate,
874 c1);
875 rx_bitset_union (cset_size, cs, it);
876 }
877 continue;
878 }
879
880 /* Could be the end of the bracket expression. If it's
881 not (i.e., when the bracket expression is `[]' so
882 far), the ']' character bit gets set way below. */
883 if (c == ']' && p != p1 + 1)
884 goto finalize_class_and_append;
885
886 /* Look ahead to see if it's a range when the last thing
887 was a character class. */
888 if (had_char_class && c == '-' && *p != ']')
889 {
890 compile_error = REG_ERANGE;
891 goto error_return;
892 }
893
894 /* Look ahead to see if it's a range when the last thing
895 was a character: if this is a hyphen not at the
896 beginning or the end of a list, then it's the range
897 operator. */
898 if (c == '-'
899 && !(p - 2 >= pattern && p[-2] == '[')
900 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
901 && *p != ']')
902 {
903 int ret
904 = compile_range (n_members, cset_size, cs, &p, pend, translate, syntax,
905 inverse_translate, validate_inv_tr);
906 if (ret != REG_NOERROR)
907 {
908 compile_error = ret;
909 goto error_return;
910 }
911 }
912
913 else if (p[0] == '-' && p[1] != ']')
914 { /* This handles ranges made up of characters only. */
915 int ret;
916
917 /* Move past the `-'. */
918 PATFETCH (c1);
919
920 ret = compile_range (n_members, cset_size, cs, &p, pend, translate, syntax,
921 inverse_translate, validate_inv_tr);
922 if (ret != REG_NOERROR)
923 {
924 compile_error = ret;
925 goto error_return;
926 }
927 }
928
929 /* See if we're at the beginning of a possible character
930 class. */
931
932 else if ((syntax & RE_CHAR_CLASSES)
933 && (c == '[') && (*p == ':'))
934 {
935 char str[CHAR_CLASS_MAX_LENGTH + 1];
936
937 PATFETCH (c);
938 c1 = 0;
939
940 /* If pattern is `[[:'. */
941 if (p == pend)
942 {
943 compile_error = REG_EBRACK;
944 goto error_return;
945 }
946
947 for (;;)
948 {
949 PATFETCH (c);
950 if (c == ':' || c == ']' || p == pend
951 || c1 == CHAR_CLASS_MAX_LENGTH)
952 break;
953 str[c1++] = c;
954 }
955 str[c1] = '\0';
956
957 /* If isn't a word bracketed by `[:' and:`]':
958 undo the ending character, the letters, and leave
959 the leading `:' and `[' (but set bits for them). */
960 if (c == ':' && *p == ']')
961 {
962 if (!strncmp (str, "cut", 3))
963 {
964 int val;
965 if (1 != sscanf (str + 3, " %d", &val))
966 {
967 compile_error = REG_ECTYPE;
968 goto error_return;
969 }
970 /* Throw away the ]] */
971 PATFETCH (c);
972 PATFETCH (c);
973 {
974 struct rexp_node * cut;
975 cut = rx_mk_r_int (r_cut, val);
976 append = cut;
977 goto append_node;
978 }
979 }
980 else if (!strncmp (str, "(", 1))
981 {
982 /* Throw away the ]] */
983 PATFETCH (c);
984 PATFETCH (c);
985 syntax_only_parens = 1;
986 goto handle_open;
987 }
988 else if (!strncmp (str, ")", 1))
989 {
990 /* Throw away the ]] */
991 PATFETCH (c);
992 PATFETCH (c);
993 syntax_only_parens = 1;
994 goto handle_close;
995 }
996 else
997 {
998 int ch;
999 int is_alnum = !strcmp (str, "alnum");
1000 int is_alpha = !strcmp (str, "alpha");
1001 int is_blank = !strcmp (str, "blank");
1002 int is_cntrl = !strcmp (str, "cntrl");
1003 int is_digit = !strcmp (str, "digit");
1004 int is_graph = !strcmp (str, "graph");
1005 int is_lower = !strcmp (str, "lower");
1006 int is_print = !strcmp (str, "print");
1007 int is_punct = !strcmp (str, "punct");
1008 int is_space = !strcmp (str, "space");
1009 int is_upper = !strcmp (str, "upper");
1010 int is_xdigit = !strcmp (str, "xdigit");
1011
1012 if (!IS_CHAR_CLASS (str))
1013 {
1014 compile_error = REG_ECTYPE;
1015 goto error_return;
1016 }
1017
1018 /* Throw away the ] at the end of the character
1019 class. */
1020 PATFETCH (c);
1021
1022 if (p == pend) { compile_error = REG_EBRACK; goto error_return; }
1023
1024 for (ch = 0; ch < 1 << CHARBITS; ch++)
1025 {
1026 if ( (is_alnum && isalnum (ch))
1027 || (is_alpha && isalpha (ch))
1028 || (is_blank && isa_blank (ch))
1029 || (is_cntrl && iscntrl (ch))
1030 || (is_digit && isdigit (ch))
1031 || (is_graph && isgraph (ch))
1032 || (is_lower && islower (ch))
1033 || (is_print && isprint (ch))
1034 || (is_punct && ispunct (ch))
1035 || (is_space && isspace (ch))
1036 || (is_upper && isupper (ch))
1037 || (is_xdigit && isxdigit (ch)))
1038 {
1039 rx_Bitset it =
1040 inverse_translation (n_members,
1041 cset_size,
1042 validate_inv_tr,
1043 inverse_translate,
1044 translate,
1045 ch);
1046 rx_bitset_union (cset_size,
1047 cs, it);
1048 }
1049 }
1050 had_char_class = 1;
1051 }
1052 }
1053 else
1054 {
1055 c1++;
1056 while (c1--)
1057 PATUNFETCH;
1058 {
1059 rx_Bitset it =
1060 inverse_translation (n_members,
1061 cset_size,
1062 validate_inv_tr,
1063 inverse_translate,
1064 translate,
1065 '[');
1066 rx_bitset_union (cset_size,
1067 cs, it);
1068 }
1069 {
1070 rx_Bitset it =
1071 inverse_translation (n_members,
1072 cset_size,
1073 validate_inv_tr,
1074 inverse_translate,
1075 translate,
1076 ':');
1077 rx_bitset_union (cset_size,
1078 cs, it);
1079 }
1080 had_char_class = 0;
1081 }
1082 }
1083 else
1084 {
1085 had_char_class = 0;
1086 {
1087 rx_Bitset it = inverse_translation (n_members,
1088 cset_size,
1089 validate_inv_tr,
1090 inverse_translate,
1091 translate,
1092 c);
1093 rx_bitset_union (cset_size, cs, it);
1094 }
1095 }
1096 }
1097
1098 finalize_class_and_append:
1099 if (is_inverted)
1100 {
1101 rx_bitset_complement (cset_size, cs);
1102 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
1103 RX_bitset_remove (cs, '\n');
1104 }
1105 goto append_node;
1106 }
1107 break;
1108
1109
1110 case '(':
1111 if (syntax & RE_NO_BK_PARENS)
1112 {
1113 syntax_only_parens = 0;
1114 goto handle_open;
1115 }
1116 else
1117 goto normal_char;
1118
1119
1120 case ')':
1121 if (syntax & RE_NO_BK_PARENS)
1122 {
1123 syntax_only_parens = 0;
1124 goto handle_close;
1125 }
1126 else
1127 goto normal_char;
1128
1129
1130 case '\n':
1131 if (syntax & RE_NEWLINE_ALT)
1132 goto handle_alt;
1133 else
1134 goto normal_char;
1135
1136
1137 case '|':
1138 if (syntax & RE_NO_BK_VBAR)
1139 goto handle_alt;
1140 else
1141 goto normal_char;
1142
1143
1144 case '{':
1145 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1146 goto handle_interval;
1147 else
1148 goto normal_char;
1149
1150
1151 case '\\':
1152 if (p == pend) { compile_error = REG_EESCAPE; goto error_return; }
1153
1154 /* Do not translate the character after the \, so that we can
1155 distinguish, e.g., \B from \b, even if we normally would
1156 translate, e.g., B to b. */
1157 PATFETCH_RAW (c);
1158
1159 switch (c)
1160 {
1161 case '(':
1162 if (syntax & RE_NO_BK_PARENS)
1163 goto normal_backslash;
1164
1165 syntax_only_parens = 0;
1166
1167 handle_open:
1168 if (!syntax_only_parens)
1169 regnum++;
1170
1171 if (COMPILE_STACK_FULL)
1172 {
1173 compile_stack.stack
1174 = ((compile_stack_elt_t *)
1175 realloc (compile_stack.stack,
1176 (compile_stack.size << 1) * sizeof (compile_stack_elt_t)));
1177 if (compile_stack.stack == 0)
1178 goto space_error;
1179 compile_stack.size <<= 1;
1180 }
1181
1182 if (*last_non_regular_expression)
1183 {
1184 struct rexp_node * concat;
1185 concat = rx_mk_r_binop (r_concat, *last_non_regular_expression, 0);
1186 if (!concat)
1187 goto space_error;
1188 *last_non_regular_expression = concat;
1189 last_non_regular_expression = &concat->params.pair.right;
1190 last_expression = last_non_regular_expression;
1191 }
1192
1193 /*
1194 * These are the values to restore when we hit end of this
1195 * group.
1196 */
1197 COMPILE_STACK_TOP.top_expression = top_expression;
1198 COMPILE_STACK_TOP.last_expression = last_expression;
1199 COMPILE_STACK_TOP.last_non_regular_expression = last_non_regular_expression;
1200
1201 if (syntax_only_parens)
1202 COMPILE_STACK_TOP.regnum = -1;
1203 else
1204 COMPILE_STACK_TOP.regnum = regnum;
1205
1206 compile_stack.avail++;
1207
1208 top_expression = last_non_regular_expression;
1209 break;
1210
1211
1212 case ')':
1213 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1214 syntax_only_parens = 0;
1215
1216 handle_close:
1217 /* See similar code for backslashed left paren above. */
1218 if (COMPILE_STACK_EMPTY)
1219 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1220 goto normal_char;
1221 else
1222 { compile_error = REG_ERPAREN; goto error_return; }
1223
1224 /* Since we just checked for an empty stack above, this
1225 * ``can't happen''.
1226 */
1227
1228 {
1229 /* We don't just want to restore into `regnum', because
1230 * later groups should continue to be numbered higher,
1231 * as in `(ab)c(de)' -- the second group is #2.
1232 */
1233 regnum_t this_group_regnum;
1234 struct rexp_node ** inner;
1235 struct rexp_node * parens;
1236
1237 inner = top_expression;
1238 compile_stack.avail--;
1239
1240 if (!!syntax_only_parens != (COMPILE_STACK_TOP.regnum == -1))
1241 { compile_error = REG_ERPAREN; goto error_return; }
1242
1243 top_expression = COMPILE_STACK_TOP.top_expression;
1244 last_expression = COMPILE_STACK_TOP.last_expression;
1245 last_non_regular_expression = COMPILE_STACK_TOP.last_non_regular_expression;
1246 this_group_regnum = COMPILE_STACK_TOP.regnum;
1247
1248 {
1249 parens = rx_mk_r_monop (r_parens, *inner);
1250 if (!parens)
1251 goto space_error;
1252 parens->params.intval = this_group_regnum;
1253 *inner = parens;
1254 break;
1255 }
1256 }
1257
1258 case '|': /* `\|'. */
1259 if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR))
1260 goto normal_backslash;
1261 handle_alt:
1262 if (syntax & RE_LIMITED_OPS)
1263 goto normal_char;
1264
1265 {
1266 struct rexp_node * alt;
1267
1268 alt = rx_mk_r_binop (r_alternate, *top_expression, 0);
1269 if (!alt)
1270 goto space_error;
1271 *top_expression = alt;
1272 last_expression = &alt->params.pair.right;
1273 last_non_regular_expression = &alt->params.pair.right;
1274 }
1275 break;
1276
1277
1278 case '{':
1279 /* If \{ is a literal. */
1280 if (!(syntax & RE_INTERVALS)
1281 /* If we're at `\{' and it's not the open-interval
1282 operator. */
1283 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1284 || (p - 2 == pattern && p == pend))
1285 goto normal_backslash;
1286
1287 handle_interval:
1288 {
1289 /* If got here, then the syntax allows intervals.
1290 */
1291
1292 /* At least (most) this many matches must be made.
1293 */
1294 int lower_bound;
1295 int upper_bound;
1296
1297 lower_bound = -1;
1298 upper_bound = -1;
1299
1300 /* We're about to parse the bounds of the interval.
1301 * It may turn out that this isn't an interval after
1302 * all, in which case these same characters will have
1303 * to be reparsed as literals. This remembers
1304 * the backtrack point in the parse:
1305 */
1306 beg_interval = p - 1;
1307
1308 if (p == pend)
1309 {
1310 if (syntax & RE_NO_BK_BRACES)
1311 goto unfetch_interval;
1312 else
1313 { compile_error = REG_EBRACE; goto error_return; }
1314 }
1315
1316 GET_UNSIGNED_NUMBER (lower_bound);
1317
1318 if (c == ',')
1319 {
1320 GET_UNSIGNED_NUMBER (upper_bound);
1321 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1322 }
1323 else
1324 /* Interval such as `{n}' => match exactly n times.
1325 */
1326 upper_bound = lower_bound;
1327
1328 if (lower_bound < 0
1329 || upper_bound > RE_DUP_MAX
1330 || lower_bound > upper_bound)
1331 {
1332 if (syntax & RE_NO_BK_BRACES)
1333 goto unfetch_interval;
1334 else
1335 { compile_error = REG_BADBR; goto error_return; }
1336 }
1337
1338 if (!(syntax & RE_NO_BK_BRACES))
1339 {
1340 if (c != '\\') { compile_error = REG_EBRACE; goto error_return; }
1341 PATFETCH (c);
1342 }
1343
1344 if (c != '}')
1345 {
1346 if (syntax & RE_NO_BK_BRACES)
1347 goto unfetch_interval;
1348 else
1349 { compile_error = REG_BADBR; goto error_return; }
1350 }
1351
1352 /* We just parsed a valid interval.
1353 * lower_bound and upper_bound are set.
1354 */
1355
1356 /* If it's invalid to have no preceding re.
1357 */
1358 if (pointless_if_repeated (*last_expression))
1359 {
1360 if (syntax & RE_CONTEXT_INVALID_OPS)
1361 { compile_error = REG_BADRPT; goto error_return; }
1362 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1363 /* treat the interval spec as literal chars. */
1364 goto unfetch_interval;
1365 }
1366
1367 {
1368 struct rexp_node * interval;
1369
1370 if (*last_expression && ((*last_expression)->type == r_string))
1371 if (factor_string (&last_expression, cset_size))
1372 goto space_error;
1373 interval = rx_mk_r_monop (r_interval, *last_expression);
1374 if (!interval)
1375 goto space_error;
1376 interval->params.intval = lower_bound;
1377 interval->params.intval2 = upper_bound;
1378 *last_expression = interval;
1379 last_non_regular_expression = last_expression;
1380 }
1381 beg_interval = 0;
1382 }
1383 break;
1384
1385 unfetch_interval:
1386 /* If an invalid interval, match the characters as literals. */
1387 p = beg_interval;
1388 beg_interval = 0;
1389
1390 /* normal_char and normal_backslash need `c'. */
1391 PATFETCH (c);
1392
1393 if (!(syntax & RE_NO_BK_BRACES))
1394 {
1395 if ((p > pattern) && (p[-1] == '\\'))
1396 goto normal_backslash;
1397 }
1398 goto normal_char;
1399
1400#ifdef emacs
1401 /* There is no way to specify the before_dot and after_dot
1402 * operators. rms says this is ok. --karl
1403 */
1404 case '=':
1405 side = '=';
1406 goto add_side_effect;
1407 break;
1408
1409 case 's':
1410 case 'S':
1411 {
1412 rx_Bitset cs;
1413 struct rexp_node * set;
1414
1415 cs = rx_cset (&cset_size);
1416 if (!cs)
1417 goto space_error;
1418 set = rx_mk_r_cset (r_cset, cset_size, cs);
1419 if (!set)
1420 {
1421 rx_free_cset (cs);
1422 goto space_error;
1423 }
1424 if (c == 'S')
1425 rx_bitset_universe (cset_size, cs);
1426
1427 PATFETCH (c);
1428 {
1429 int x;
1430 enum syntaxcode code = syntax_spec_code [c];
1431 for (x = 0; x < 256; ++x)
1432 {
1433
1434 if (SYNTAX (x) == code)
1435 {
1436 rx_Bitset it =
1437 inverse_translation (n_members,
1438 cset_size, validate_inv_tr,
1439 inverse_translate,
1440 translate, x);
1441 rx_bitset_xor (cset_size, cs, it);
1442 }
1443 }
1444 }
1445 append = set;
1446 goto append_node;
1447 }
1448 break;
1449#endif /* emacs */
1450
1451
1452 case 'w':
1453 case 'W':
1454 {
1455 rx_Bitset cs;
1456 struct rexp_node * n;
1457
1458 cs = rx_cset (cset_size);
1459 n = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0);
1460 if (!(cs && n))
1461 {
1462 if (cs)
1463 rx_free_cset (cs);
1464 goto space_error;
1465 }
1466 if (c == 'W')
1467 rx_bitset_universe (cset_size ,cs);
1468 {
1469 int x;
1470 for (x = cset_size - 1; x > 0; --x)
1471 if (SYNTAX(x) & Sword)
1472 RX_bitset_toggle (cs, x);
1473 }
1474 append = n;
1475 goto append_node;
1476 }
1477 break;
1478
1479 case '<':
1480 side = '<';
1481 goto add_side_effect;
1482 break;
1483
1484 case '>':
1485 side = '>';
1486 goto add_side_effect;
1487 break;
1488
1489 case 'b':
1490 side = 'b';
1491 goto add_side_effect;
1492 break;
1493
1494 case 'B':
1495 side = 'B';
1496 goto add_side_effect;
1497 break;
1498
1499 case '`':
1500 side = '`';
1501 goto add_side_effect;
1502 break;
1503
1504 case '\'':
1505 side = '\'';
1506 goto add_side_effect;
1507 break;
1508
1509 add_side_effect:
1510 {
1511 struct rexp_node * se;
1512 se = rx_mk_r_int (r_context, side);
1513 if (!se)
1514 goto space_error;
1515 append = se;
1516 goto append_node;
1517 }
1518 break;
1519
1520 case '1': case '2': case '3': case '4': case '5':
1521 case '6': case '7': case '8': case '9':
1522 if (syntax & RE_NO_BK_REFS)
1523 goto normal_char;
1524
1525 c1 = c - '0';
1526
1527 /* Can't back reference to a subexpression if inside of it. */
1528 if (group_in_compile_stack (compile_stack, c1))
1529 goto normal_char;
1530
1531 if (c1 > regnum)
1532 { compile_error = REG_ESUBREG; goto error_return; }
1533
1534 side = c;
1535 goto add_side_effect;
1536 break;
1537
1538 case '+':
1539 case '?':
1540 if (syntax & RE_BK_PLUS_QM)
1541 goto handle_plus;
1542 else
1543 goto normal_backslash;
1544
1545 default:
1546 normal_backslash:
1547 /* You might think it would be useful for \ to mean
1548 * not to translate; but if we don't translate it
1549 * it will never match anything.
1550 */
1551 c = TRANSLATE (c);
1552 goto normal_char;
1553 }
1554 break;
1555
1556
1557 default:
1558 /* Expects the character in `c'. */
1559 normal_char:
1560 {
1561 rx_Bitset cs;
1562 struct rexp_node * match;
1563 rx_Bitset it;
1564
1565 it = inverse_translation (n_members,
1566 cset_size, validate_inv_tr,
1567 inverse_translate, translate, c);
1568
1569 if (1 != n_members[c])
1570 {
1571 cs = rx_cset (cset_size);
1572 match = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0);
1573 if (!(cs && match))
1574 {
1575 if (cs)
1576 rx_free_cset (cs);
1577 goto space_error;
1578 }
1579 rx_bitset_union (CHAR_SET_SIZE, cs, it);
1580 append = match;
1581 goto append_node;
1582 }
1583 else
1584 {
1585 if (*last_expression && (*last_expression)->type == r_string)
1586 {
1587 if (rx_adjoin_string (&((*last_expression)->params.cstr), c))
1588 goto space_error;
1589 break;
1590 }
1591 else
1592 {
1593 append = rx_mk_r_str (r_string, c);
1594 if(!append)
1595 goto space_error;
1596 goto append_node;
1597 }
1598 }
1599 break;
1600
1601 append_node:
1602 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
1603 * and then parses the next character normally.
1604 */
1605 if (RX_regular_node_type (append->type))
1606 {
1607 if (!*last_expression)
1608 *last_expression = append;
1609 else
1610 {
1611 struct rexp_node * concat;
1612 concat = rx_mk_r_binop (r_concat,
1613 *last_expression, append);
1614 if (!concat)
1615 goto space_error;
1616 *last_expression = concat;
1617 last_expression = &concat->params.pair.right;
1618 }
1619 }
1620 else
1621 {
1622 if (!*last_non_regular_expression)
1623 {
1624 *last_non_regular_expression = append;
1625 last_expression = last_non_regular_expression;
1626 }
1627 else
1628 {
1629 struct rexp_node * concat;
1630 concat = rx_mk_r_binop (r_concat,
1631 *last_non_regular_expression, append);
1632 if (!concat)
1633 goto space_error;
1634 *last_non_regular_expression = concat;
1635 last_non_regular_expression = &concat->params.pair.right;
1636 last_expression = last_non_regular_expression;
1637 }
1638 }
1639 }
1640 } /* switch (c) */
1641 } /* while p != pend */
1642
1643
1644 /* Through the pattern now. */
1645
1646 if (!COMPILE_STACK_EMPTY)
1647 { compile_error = REG_EPAREN; goto error_return; }
1648 free (compile_stack.stack);
1649
1650
1651 *rexp_p = rexp;
1652 return REG_NOERROR;
1653
1654 space_error:
1655 compile_error = REG_ESPACE;
1656
1657 error_return:
1658 free (compile_stack.stack);
1659 /* Free expressions pushed onto the compile stack! */
1660 if (rexp)
1661 rx_free_rexp (rexp);
1662 return compile_error;
1663}
1664
1665