1 /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU Library General Public License as published by
5 * the Free Software Foundation; either version 2, or (at your option)
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU Library General Public License for more details.
13 * You should have received a copy of the GNU Library General Public License
14 * along with this software; see the file COPYING. If not, write to
15 * the Free Software Foundation, 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
20 #include <sys/types.h>
22 #include "rxgnucomp.h"
23 #include "inst-rxposix.h"
29 /* Define the syntax basics for \<, \>, etc.
34 #define CHAR_SET_SIZE (1 << CHARBITS)
36 #define SYNTAX(c) re_syntax_table[c]
37 char re_syntax_table
[CHAR_SET_SIZE
];
41 init_syntax_once (void)
53 rx_bzero ((char *)re_syntax_table
, sizeof re_syntax_table
);
55 for (c
= 'a'; c
<= 'z'; c
++)
56 re_syntax_table
[c
] = Sword
;
58 for (c
= 'A'; c
<= 'Z'; c
++)
59 re_syntax_table
[c
] = Sword
;
61 for (c
= '0'; c
<= '9'; c
++)
62 re_syntax_table
[c
] = Sword
;
64 re_syntax_table
['_'] = Sword
;
68 #endif /* not emacs */
74 const char *rx_error_msg
[] =
77 "No match", /* REG_NOMATCH */
78 "Invalid regular expression", /* REG_BADPAT */
79 "Invalid collation character", /* REG_ECOLLATE */
80 "Invalid character class name", /* REG_ECTYPE */
81 "Trailing backslash", /* REG_EESCAPE */
82 "Invalid back reference", /* REG_ESUBREG */
83 "Unmatched [ or [^", /* REG_EBRACK */
84 "Unmatched ( or \\(", /* REG_EPAREN */
85 "Unmatched \\{", /* REG_EBRACE */
86 "Invalid content of \\{\\}", /* REG_BADBR */
87 "Invalid range end", /* REG_ERANGE */
88 "Memory exhausted", /* REG_ESPACE */
89 "Invalid preceding regular expression", /* REG_BADRPT */
90 "Premature end of regular expression", /* REG_EEND */
91 "Regular expression too big", /* REG_ESIZE */
92 "Unmatched ) or \\)", /* REG_ERPAREN */
98 * Macros used while compiling patterns.
100 * By convention, PEND points just past the end of the uncompiled pattern,
101 * P points to the read position in the pattern. `translate' is the name
102 * of the translation table (`TRANSLATE' is the name of a macro that looks
103 * things up in `translate').
108 * Fetch the next character in the uncompiled pattern---translating it
109 * if necessary. *Also cast from a signed character in the constant
110 * string passed to us by the user to an unsigned char that we can use
111 * as an array index (in, e.g., `translate').
113 #define PATFETCH(c) \
114 do {if (p == pend) return REG_EEND; \
115 c = (unsigned char) *p++; \
120 * Fetch the next character in the uncompiled pattern, with no
123 #define PATFETCH_RAW(c) \
124 do {if (p == pend) return REG_EEND; \
125 c = (unsigned char) *p++; \
128 /* Go backwards one character in the pattern. */
129 #define PATUNFETCH p--
132 #define TRANSLATE(d) translate[(unsigned char) (d)]
134 typedef int regnum_t
;
136 /* Since offsets can go either forwards or backwards, this type needs to
137 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
139 typedef int pattern_offset_t
;
143 struct rexp_node
** top_expression
;
144 struct rexp_node
** last_expression
;
145 struct rexp_node
** last_non_regular_expression
;
146 pattern_offset_t inner_group_offset
;
148 } compile_stack_elt_t
;
152 compile_stack_elt_t
*stack
;
154 unsigned avail
; /* Offset of next open position. */
155 } compile_stack_type
;
158 #define INIT_COMPILE_STACK_SIZE 32
160 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
161 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
163 /* The next available element. */
164 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
167 /* Set the bit for character C in a list. */
168 #define SET_LIST_BIT(c) \
169 (b[((unsigned char) (c)) / CHARBITS] \
170 |= 1 << (((unsigned char) c) % CHARBITS))
172 /* Get the next unsigned number in the uncompiled pattern. */
173 #define GET_UNSIGNED_NUMBER(num) \
177 while (isdigit (c)) \
181 num = num * 10 + c - '0'; \
189 #define CHAR_CLASS_MAX_LENGTH 64
191 #define IS_CHAR_CLASS(string) \
192 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
193 || !strcmp (string, "lower") || !strcmp (string, "digit") \
194 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
195 || !strcmp (string, "space") || !strcmp (string, "print") \
196 || !strcmp (string, "punct") || !strcmp (string, "graph") \
197 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
200 /* These predicates are used in regex_compile. */
202 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
203 * after an alternative or a begin-subexpression. We assume there is at
204 * least one character before the ^.
209 at_begline_loc_p (const char *pattern
, const char * p
, unsigned long syntax
)
212 at_begline_loc_p (pattern
, p
, syntax
)
215 unsigned long syntax
;
218 const char *prev
= p
- 2;
219 int prev_prev_backslash
= ((prev
> pattern
) && (prev
[-1] == '\\'));
223 (/* After a subexpression? */
224 ((*prev
== '(') && ((syntax
& RE_NO_BK_PARENS
) || prev_prev_backslash
))
226 /* After an alternative? */
227 ((*prev
== '|') && ((syntax
& RE_NO_BK_VBAR
) || prev_prev_backslash
))
231 /* The dual of at_begline_loc_p. This one is for $. We assume there is
232 * at least one character after the $, i.e., `P < PEND'.
237 at_endline_loc_p (const char *p
, const char *pend
, int syntax
)
240 at_endline_loc_p (p
, pend
, syntax
)
246 const char *next
= p
;
247 int next_backslash
= (*next
== '\\');
248 const char *next_next
= (p
+ 1 < pend
) ? (p
+ 1) : 0;
252 /* Before a subexpression? */
253 ((syntax
& RE_NO_BK_PARENS
)
255 : (next_backslash
&& next_next
&& (*next_next
== ')')))
257 /* Before an alternative? */
258 ((syntax
& RE_NO_BK_VBAR
)
260 : (next_backslash
&& next_next
&& (*next_next
== '|')))
265 unsigned char rx_id_translation
[256] =
267 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
268 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
269 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
270 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
271 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
272 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
273 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
274 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
275 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
276 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
278 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
279 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
280 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
281 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
282 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
283 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
284 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
285 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
286 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
287 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
289 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
290 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
291 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
292 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
293 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
294 250, 251, 252, 253, 254, 255
297 /* The compiler keeps an inverted translation table.
298 * This looks up/inititalize elements.
299 * VALID is an array of booleans that validate CACHE.
304 inverse_translation (int * n_members
, int cset_size
, char * valid
, rx_Bitset cache
, unsigned char * translate
, int c
)
307 inverse_translation (n_members
, cset_size
, valid
, cache
, translate
, c
)
312 unsigned char * translate
;
317 cs
= cache
+ c
* rx_bitset_numb_subsets (cset_size
);
326 rx_bitset_null (cset_size
, cs
);
328 for (x
= 0; x
< 256; ++x
)
329 if (TRANSLATE(x
) == c_tr
)
331 RX_bitset_enjoin (cs
, x
);
335 n_members
[c
] = membs
;
343 /* More subroutine declarations and macros for regex_compile. */
345 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
351 group_in_compile_stack (compile_stack_type compile_stack
, regnum_t regnum
)
354 group_in_compile_stack (compile_stack
, regnum
)
355 compile_stack_type compile_stack
;
361 for (this_element
= compile_stack
.avail
- 1;
364 if (compile_stack
.stack
[this_element
].regnum
== regnum
)
372 * Read the ending character of a range (in a bracket expression) from the
373 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
374 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
375 * Then we set the translation of all bits between the starting and
376 * ending characters (inclusive) in the compiled pattern B.
378 * Return an error code.
380 * We use these short variable names so we can use the same macros as
381 * `regex_compile' itself.
386 compile_range (int * n_members
, int cset_size
, rx_Bitset cs
, const char ** p_ptr
, const char * pend
, unsigned char * translate
, unsigned long syntax
, rx_Bitset inv_tr
, char * valid_inv_tr
)
389 compile_range (n_members
, cset_size
, cs
, p_ptr
, pend
, translate
, syntax
, inv_tr
, valid_inv_tr
)
395 unsigned char * translate
;
396 unsigned long syntax
;
403 const char *p
= *p_ptr
;
405 unsigned char range_end
;
406 unsigned char range_start
= TRANSLATE(p
[-2]);
411 PATFETCH (range_end
);
415 if (range_start
> range_end
)
416 return syntax
& RE_NO_EMPTY_RANGES
? REG_ERANGE
: REG_NOERROR
;
418 for (this_char
= range_start
; this_char
<= range_end
; this_char
++)
421 inverse_translation (n_members
, cset_size
, valid_inv_tr
, inv_tr
, translate
, this_char
);
422 rx_bitset_union (cset_size
, cs
, it
);
431 pointless_if_repeated (struct rexp_node
* node
)
434 pointless_if_repeated (node
)
435 struct rexp_node
* node
;
448 return (pointless_if_repeated (node
->params
.pair
.left
)
449 && pointless_if_repeated (node
->params
.pair
.right
));
454 return pointless_if_repeated (node
->params
.pair
.left
);
456 switch (node
->params
.intval
)
478 factor_string (struct rexp_node
*** lastp
, int cset_size
)
481 factor_string (lastp
, cset_size
)
482 struct rexp_node
*** lastp
;
486 struct rexp_node
** expp
;
487 struct rexp_node
* exp
;
489 struct rexp_node
* cset_node
;
492 exp
= *expp
; /* presumed r_string */
494 cs
= rx_cset (cset_size
);
497 RX_bitset_enjoin (cs
, exp
->params
.cstr
.contents
[exp
->params
.cstr
.len
- 1]);
498 cset_node
= rx_mk_r_cset (r_cset
, cset_size
, cs
);
504 if (exp
->params
.cstr
.len
== 1)
508 /* lastp remains the same */
513 struct rexp_node
* concat_node
;
514 exp
->params
.cstr
.len
--;
515 concat_node
= rx_mk_r_binop (r_concat
, exp
, cset_node
);
518 rx_free_rexp (cset_node
);
522 *lastp
= &concat_node
->params
.pair
.right
;
529 #define isa_blank(C) (((C) == ' ') || ((C) == '\t'))
533 rx_parse (struct rexp_node
** rexp_p
,
536 unsigned long syntax
,
538 unsigned char *translate
)
541 rx_parse (rexp_p
, pattern
, size
, syntax
, cset_size
, translate
)
542 struct rexp_node
** rexp_p
;
545 unsigned long syntax
;
547 unsigned char *translate
;
552 inverse_translate
[CHAR_SET_SIZE
* rx_bitset_numb_subsets(CHAR_SET_SIZE
)];
553 char validate_inv_tr
[CHAR_SET_SIZE
];
554 int n_members
[CHAR_SET_SIZE
];
556 /* We fetch characters from PATTERN here. Even though PATTERN is
557 * `char *' (i.e., signed), we declare these variables as unsigned, so
558 * they can be reliably used as array indices.
560 register unsigned char c
;
561 register unsigned char c1
;
563 /* A random tempory spot in PATTERN. */
566 /* Keeps track of unclosed groups. */
567 compile_stack_type compile_stack
;
569 /* Points to the current (ending) position in the pattern. */
573 /* When parsing is done, this will hold the expression tree. */
574 struct rexp_node
* rexp
;
576 /* This and top_expression are saved on the compile stack. */
577 struct rexp_node
** top_expression
;
578 struct rexp_node
** last_non_regular_expression
;
579 struct rexp_node
** last_expression
;
581 /* Parameter to `goto append_node' */
582 struct rexp_node
* append
;
584 /* Counts open-groups as they are encountered. This is the index of the
585 * innermost group being compiled.
589 /* True iff the sub-expression just started
590 * is purely syntactic. Otherwise, a regmatch data
591 * slot is allocated for the subexpression.
593 int syntax_only_parens
;
595 /* Place in the uncompiled pattern (i.e., the {) to
596 * which to go back if the interval is invalid.
598 const char *beg_interval
;
605 translate
= rx_id_translation
;
607 /* Points to the current (ending) position in the pattern. */
609 pend
= pattern
+ size
;
611 /* When parsing is done, this will hold the expression tree. */
614 /* In the midst of compilation, this holds onto the regexp
615 * first parst while rexp goes on to aquire additional constructs.
617 top_expression
= &rexp
;
618 last_non_regular_expression
= top_expression
;
619 last_expression
= top_expression
;
621 /* Counts open-groups as they are encountered. This is the index of the
622 * innermost group being compiled.
626 rx_bzero ((char *)validate_inv_tr
, sizeof (validate_inv_tr
));
629 /* Initialize the compile stack. */
630 compile_stack
.stack
= (( compile_stack_elt_t
*) malloc ((INIT_COMPILE_STACK_SIZE
) * sizeof ( compile_stack_elt_t
)));
631 if (compile_stack
.stack
== 0)
634 compile_stack
.size
= INIT_COMPILE_STACK_SIZE
;
635 compile_stack
.avail
= 0;
637 #if !defined (emacs) && !defined (SYNTAX_TABLE)
638 /* Initialize the syntax table. */
642 /* Loop through the uncompiled pattern until we're at the end. */
651 if ( /* If at start of pattern, it's an operator. */
653 /* If context independent, it's an operator. */
654 || syntax
& RE_CONTEXT_INDEP_ANCHORS
655 /* Otherwise, depends on what's come before. */
656 || at_begline_loc_p (pattern
, p
, syntax
))
659 = rx_mk_r_int (r_context
, '^');
673 if ( /* If at end of pattern, it's an operator. */
675 /* If context independent, it's an operator. */
676 || syntax
& RE_CONTEXT_INDEP_ANCHORS
677 /* Otherwise, depends on what's next. */
678 || at_endline_loc_p (p
, pend
, syntax
))
681 = rx_mk_r_int (r_context
, '$');
695 if ((syntax
& RE_BK_PLUS_QM
)
696 || (syntax
& RE_LIMITED_OPS
))
701 /* If there is no previous pattern... */
702 if (pointless_if_repeated (*last_expression
))
704 if (syntax
& RE_CONTEXT_INVALID_OPS
)
706 compile_error
= REG_BADRPT
;
709 else if (!(syntax
& RE_CONTEXT_INDEP_OPS
))
714 /* 1 means zero (many) matches is allowed. */
715 char zero_times_ok
= 0, many_times_ok
= 0;
717 /* If there is a sequence of repetition chars, collapse it
718 down to just one (the right one). We can't combine
719 interval operators with these because of, e.g., `a{2}*',
720 which should only match an even number of `a's. */
724 zero_times_ok
|= c
!= '+';
725 many_times_ok
|= c
!= '?';
733 || (!(syntax
& RE_BK_PLUS_QM
) && (c
== '+' || c
== '?')))
736 else if (syntax
& RE_BK_PLUS_QM
&& c
== '\\')
740 compile_error
= REG_EESCAPE
;
745 if (!(c1
== '+' || c1
== '?'))
760 /* If we get here, we found another repeat character. */
763 /* Now we know whether or not zero matches is allowed
764 * and also whether or not two or more matches is allowed.
768 struct rexp_node
* inner_exp
;
769 struct rexp_node
* star
;
771 if (*last_expression
&& ((*last_expression
)->type
== r_string
))
772 if (factor_string (&last_expression
, cset_size
))
774 inner_exp
= *last_expression
;
775 star
= rx_mk_r_monop ((many_times_ok
776 ? (zero_times_ok
? r_star
: r_plus
)
781 *last_expression
= star
;
790 struct rexp_node
* n
;
791 cs
= rx_cset (cset_size
);
794 n
= rx_mk_r_cset (r_cset
, cset_size
, cs
);
800 rx_bitset_universe (cset_size
, cs
);
801 if (!(syntax
& RE_DOT_NEWLINE
))
802 RX_bitset_remove (cs
, '\n');
803 if (syntax
& RE_DOT_NOT_NULL
)
804 RX_bitset_remove (cs
, 0);
815 compile_error
= REG_EBRACK
;
821 struct rexp_node
* node
;
825 is_inverted
= *p
== '^';
826 cs
= rx_cset (cset_size
);
829 node
= rx_mk_r_cset (r_cset
, cset_size
,cs
);
836 /* This branch of the switch is normally exited with
844 /* Remember the first position in the bracket expression. */
847 /* Read in characters and ranges, setting map bits. */
852 compile_error
= REG_EBRACK
;
858 /* \ might escape characters inside [...] and [^...]. */
859 if ((syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
) && c
== '\\')
863 compile_error
= REG_EESCAPE
;
869 rx_Bitset it
= inverse_translation (n_members
,
875 rx_bitset_union (cset_size
, cs
, it
);
880 /* Could be the end of the bracket expression. If it's
881 not (i.e., when the bracket expression is `[]' so
882 far), the ']' character bit gets set way below. */
883 if (c
== ']' && p
!= p1
+ 1)
884 goto finalize_class_and_append
;
886 /* Look ahead to see if it's a range when the last thing
887 was a character class. */
888 if (had_char_class
&& c
== '-' && *p
!= ']')
890 compile_error
= REG_ERANGE
;
894 /* Look ahead to see if it's a range when the last thing
895 was a character: if this is a hyphen not at the
896 beginning or the end of a list, then it's the range
899 && !(p
- 2 >= pattern
&& p
[-2] == '[')
900 && !(p
- 3 >= pattern
&& p
[-3] == '[' && p
[-2] == '^')
904 = compile_range (n_members
, cset_size
, cs
, &p
, pend
, translate
, syntax
,
905 inverse_translate
, validate_inv_tr
);
906 if (ret
!= REG_NOERROR
)
913 else if (p
[0] == '-' && p
[1] != ']')
914 { /* This handles ranges made up of characters only. */
917 /* Move past the `-'. */
920 ret
= compile_range (n_members
, cset_size
, cs
, &p
, pend
, translate
, syntax
,
921 inverse_translate
, validate_inv_tr
);
922 if (ret
!= REG_NOERROR
)
929 /* See if we're at the beginning of a possible character
932 else if ((syntax
& RE_CHAR_CLASSES
)
933 && (c
== '[') && (*p
== ':'))
935 char str
[CHAR_CLASS_MAX_LENGTH
+ 1];
940 /* If pattern is `[[:'. */
943 compile_error
= REG_EBRACK
;
950 if (c
== ':' || c
== ']' || p
== pend
951 || c1
== CHAR_CLASS_MAX_LENGTH
)
957 /* If isn't a word bracketed by `[:' and:`]':
958 undo the ending character, the letters, and leave
959 the leading `:' and `[' (but set bits for them). */
960 if (c
== ':' && *p
== ']')
962 if (!strncmp (str
, "cut", 3))
965 if (1 != sscanf (str
+ 3, " %d", &val
))
967 compile_error
= REG_ECTYPE
;
970 /* Throw away the ]] */
974 struct rexp_node
* cut
;
975 cut
= rx_mk_r_int (r_cut
, val
);
980 else if (!strncmp (str
, "(", 1))
982 /* Throw away the ]] */
985 syntax_only_parens
= 1;
988 else if (!strncmp (str
, ")", 1))
990 /* Throw away the ]] */
993 syntax_only_parens
= 1;
999 int is_alnum
= !strcmp (str
, "alnum");
1000 int is_alpha
= !strcmp (str
, "alpha");
1001 int is_blank
= !strcmp (str
, "blank");
1002 int is_cntrl
= !strcmp (str
, "cntrl");
1003 int is_digit
= !strcmp (str
, "digit");
1004 int is_graph
= !strcmp (str
, "graph");
1005 int is_lower
= !strcmp (str
, "lower");
1006 int is_print
= !strcmp (str
, "print");
1007 int is_punct
= !strcmp (str
, "punct");
1008 int is_space
= !strcmp (str
, "space");
1009 int is_upper
= !strcmp (str
, "upper");
1010 int is_xdigit
= !strcmp (str
, "xdigit");
1012 if (!IS_CHAR_CLASS (str
))
1014 compile_error
= REG_ECTYPE
;
1018 /* Throw away the ] at the end of the character
1022 if (p
== pend
) { compile_error
= REG_EBRACK
; goto error_return
; }
1024 for (ch
= 0; ch
< 1 << CHARBITS
; ch
++)
1026 if ( (is_alnum
&& isalnum (ch
))
1027 || (is_alpha
&& isalpha (ch
))
1028 || (is_blank
&& isa_blank (ch
))
1029 || (is_cntrl
&& iscntrl (ch
))
1030 || (is_digit
&& isdigit (ch
))
1031 || (is_graph
&& isgraph (ch
))
1032 || (is_lower
&& islower (ch
))
1033 || (is_print
&& isprint (ch
))
1034 || (is_punct
&& ispunct (ch
))
1035 || (is_space
&& isspace (ch
))
1036 || (is_upper
&& isupper (ch
))
1037 || (is_xdigit
&& isxdigit (ch
)))
1040 inverse_translation (n_members
,
1046 rx_bitset_union (cset_size
,
1060 inverse_translation (n_members
,
1066 rx_bitset_union (cset_size
,
1071 inverse_translation (n_members
,
1077 rx_bitset_union (cset_size
,
1087 rx_Bitset it
= inverse_translation (n_members
,
1093 rx_bitset_union (cset_size
, cs
, it
);
1098 finalize_class_and_append
:
1101 rx_bitset_complement (cset_size
, cs
);
1102 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
1103 RX_bitset_remove (cs
, '\n');
1111 if (syntax
& RE_NO_BK_PARENS
)
1113 syntax_only_parens
= 0;
1121 if (syntax
& RE_NO_BK_PARENS
)
1123 syntax_only_parens
= 0;
1131 if (syntax
& RE_NEWLINE_ALT
)
1138 if (syntax
& RE_NO_BK_VBAR
)
1145 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1146 goto handle_interval
;
1152 if (p
== pend
) { compile_error
= REG_EESCAPE
; goto error_return
; }
1154 /* Do not translate the character after the \, so that we can
1155 distinguish, e.g., \B from \b, even if we normally would
1156 translate, e.g., B to b. */
1162 if (syntax
& RE_NO_BK_PARENS
)
1163 goto normal_backslash
;
1165 syntax_only_parens
= 0;
1168 if (!syntax_only_parens
)
1171 if (COMPILE_STACK_FULL
)
1174 = ((compile_stack_elt_t
*)
1175 realloc (compile_stack
.stack
,
1176 (compile_stack
.size
<< 1) * sizeof (compile_stack_elt_t
)));
1177 if (compile_stack
.stack
== 0)
1179 compile_stack
.size
<<= 1;
1182 if (*last_non_regular_expression
)
1184 struct rexp_node
* concat
;
1185 concat
= rx_mk_r_binop (r_concat
, *last_non_regular_expression
, 0);
1188 *last_non_regular_expression
= concat
;
1189 last_non_regular_expression
= &concat
->params
.pair
.right
;
1190 last_expression
= last_non_regular_expression
;
1194 * These are the values to restore when we hit end of this
1197 COMPILE_STACK_TOP
.top_expression
= top_expression
;
1198 COMPILE_STACK_TOP
.last_expression
= last_expression
;
1199 COMPILE_STACK_TOP
.last_non_regular_expression
= last_non_regular_expression
;
1201 if (syntax_only_parens
)
1202 COMPILE_STACK_TOP
.regnum
= -1;
1204 COMPILE_STACK_TOP
.regnum
= regnum
;
1206 compile_stack
.avail
++;
1208 top_expression
= last_non_regular_expression
;
1213 if (syntax
& RE_NO_BK_PARENS
) goto normal_backslash
;
1214 syntax_only_parens
= 0;
1217 /* See similar code for backslashed left paren above. */
1218 if (COMPILE_STACK_EMPTY
)
1219 if (syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
1222 { compile_error
= REG_ERPAREN
; goto error_return
; }
1224 /* Since we just checked for an empty stack above, this
1229 /* We don't just want to restore into `regnum', because
1230 * later groups should continue to be numbered higher,
1231 * as in `(ab)c(de)' -- the second group is #2.
1233 regnum_t this_group_regnum
;
1234 struct rexp_node
** inner
;
1235 struct rexp_node
* parens
;
1237 inner
= top_expression
;
1238 compile_stack
.avail
--;
1240 if (!!syntax_only_parens
!= (COMPILE_STACK_TOP
.regnum
== -1))
1241 { compile_error
= REG_ERPAREN
; goto error_return
; }
1243 top_expression
= COMPILE_STACK_TOP
.top_expression
;
1244 last_expression
= COMPILE_STACK_TOP
.last_expression
;
1245 last_non_regular_expression
= COMPILE_STACK_TOP
.last_non_regular_expression
;
1246 this_group_regnum
= COMPILE_STACK_TOP
.regnum
;
1249 parens
= rx_mk_r_monop (r_parens
, *inner
);
1252 parens
->params
.intval
= this_group_regnum
;
1258 case '|': /* `\|'. */
1259 if ((syntax
& RE_LIMITED_OPS
) || (syntax
& RE_NO_BK_VBAR
))
1260 goto normal_backslash
;
1262 if (syntax
& RE_LIMITED_OPS
)
1266 struct rexp_node
* alt
;
1268 alt
= rx_mk_r_binop (r_alternate
, *top_expression
, 0);
1271 *top_expression
= alt
;
1272 last_expression
= &alt
->params
.pair
.right
;
1273 last_non_regular_expression
= &alt
->params
.pair
.right
;
1279 /* If \{ is a literal. */
1280 if (!(syntax
& RE_INTERVALS
)
1281 /* If we're at `\{' and it's not the open-interval
1283 || ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1284 || (p
- 2 == pattern
&& p
== pend
))
1285 goto normal_backslash
;
1289 /* If got here, then the syntax allows intervals.
1292 /* At least (most) this many matches must be made.
1300 /* We're about to parse the bounds of the interval.
1301 * It may turn out that this isn't an interval after
1302 * all, in which case these same characters will have
1303 * to be reparsed as literals. This remembers
1304 * the backtrack point in the parse:
1306 beg_interval
= p
- 1;
1310 if (syntax
& RE_NO_BK_BRACES
)
1311 goto unfetch_interval
;
1313 { compile_error
= REG_EBRACE
; goto error_return
; }
1316 GET_UNSIGNED_NUMBER (lower_bound
);
1320 GET_UNSIGNED_NUMBER (upper_bound
);
1321 if (upper_bound
< 0) upper_bound
= RE_DUP_MAX
;
1324 /* Interval such as `{n}' => match exactly n times.
1326 upper_bound
= lower_bound
;
1329 || upper_bound
> RE_DUP_MAX
1330 || lower_bound
> upper_bound
)
1332 if (syntax
& RE_NO_BK_BRACES
)
1333 goto unfetch_interval
;
1335 { compile_error
= REG_BADBR
; goto error_return
; }
1338 if (!(syntax
& RE_NO_BK_BRACES
))
1340 if (c
!= '\\') { compile_error
= REG_EBRACE
; goto error_return
; }
1346 if (syntax
& RE_NO_BK_BRACES
)
1347 goto unfetch_interval
;
1349 { compile_error
= REG_BADBR
; goto error_return
; }
1352 /* We just parsed a valid interval.
1353 * lower_bound and upper_bound are set.
1356 /* If it's invalid to have no preceding re.
1358 if (pointless_if_repeated (*last_expression
))
1360 if (syntax
& RE_CONTEXT_INVALID_OPS
)
1361 { compile_error
= REG_BADRPT
; goto error_return
; }
1362 else if (!(syntax
& RE_CONTEXT_INDEP_OPS
))
1363 /* treat the interval spec as literal chars. */
1364 goto unfetch_interval
;
1368 struct rexp_node
* interval
;
1370 if (*last_expression
&& ((*last_expression
)->type
== r_string
))
1371 if (factor_string (&last_expression
, cset_size
))
1373 interval
= rx_mk_r_monop (r_interval
, *last_expression
);
1376 interval
->params
.intval
= lower_bound
;
1377 interval
->params
.intval2
= upper_bound
;
1378 *last_expression
= interval
;
1379 last_non_regular_expression
= last_expression
;
1386 /* If an invalid interval, match the characters as literals. */
1390 /* normal_char and normal_backslash need `c'. */
1393 if (!(syntax
& RE_NO_BK_BRACES
))
1395 if ((p
> pattern
) && (p
[-1] == '\\'))
1396 goto normal_backslash
;
1401 /* There is no way to specify the before_dot and after_dot
1402 * operators. rms says this is ok. --karl
1406 goto add_side_effect
;
1413 struct rexp_node
* set
;
1415 cs
= rx_cset (&cset_size
);
1418 set
= rx_mk_r_cset (r_cset
, cset_size
, cs
);
1425 rx_bitset_universe (cset_size
, cs
);
1430 enum syntaxcode code
= syntax_spec_code
[c
];
1431 for (x
= 0; x
< 256; ++x
)
1434 if (SYNTAX (x
) == code
)
1437 inverse_translation (n_members
,
1438 cset_size
, validate_inv_tr
,
1441 rx_bitset_xor (cset_size
, cs
, it
);
1456 struct rexp_node
* n
;
1458 cs
= rx_cset (cset_size
);
1459 n
= (cs
? rx_mk_r_cset (r_cset
, cset_size
, cs
) : 0);
1467 rx_bitset_universe (cset_size
,cs
);
1470 for (x
= cset_size
- 1; x
> 0; --x
)
1471 if (SYNTAX(x
) & Sword
)
1472 RX_bitset_toggle (cs
, x
);
1481 goto add_side_effect
;
1486 goto add_side_effect
;
1491 goto add_side_effect
;
1496 goto add_side_effect
;
1501 goto add_side_effect
;
1506 goto add_side_effect
;
1511 struct rexp_node
* se
;
1512 se
= rx_mk_r_int (r_context
, side
);
1520 case '1': case '2': case '3': case '4': case '5':
1521 case '6': case '7': case '8': case '9':
1522 if (syntax
& RE_NO_BK_REFS
)
1527 /* Can't back reference to a subexpression if inside of it. */
1528 if (group_in_compile_stack (compile_stack
, c1
))
1532 { compile_error
= REG_ESUBREG
; goto error_return
; }
1535 goto add_side_effect
;
1540 if (syntax
& RE_BK_PLUS_QM
)
1543 goto normal_backslash
;
1547 /* You might think it would be useful for \ to mean
1548 * not to translate; but if we don't translate it
1549 * it will never match anything.
1558 /* Expects the character in `c'. */
1562 struct rexp_node
* match
;
1565 it
= inverse_translation (n_members
,
1566 cset_size
, validate_inv_tr
,
1567 inverse_translate
, translate
, c
);
1569 if (1 != n_members
[c
])
1571 cs
= rx_cset (cset_size
);
1572 match
= (cs
? rx_mk_r_cset (r_cset
, cset_size
, cs
) : 0);
1579 rx_bitset_union (CHAR_SET_SIZE
, cs
, it
);
1585 if (*last_expression
&& (*last_expression
)->type
== r_string
)
1587 if (rx_adjoin_string (&((*last_expression
)->params
.cstr
), c
))
1593 append
= rx_mk_r_str (r_string
, c
);
1602 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
1603 * and then parses the next character normally.
1605 if (RX_regular_node_type (append
->type
))
1607 if (!*last_expression
)
1608 *last_expression
= append
;
1611 struct rexp_node
* concat
;
1612 concat
= rx_mk_r_binop (r_concat
,
1613 *last_expression
, append
);
1616 *last_expression
= concat
;
1617 last_expression
= &concat
->params
.pair
.right
;
1622 if (!*last_non_regular_expression
)
1624 *last_non_regular_expression
= append
;
1625 last_expression
= last_non_regular_expression
;
1629 struct rexp_node
* concat
;
1630 concat
= rx_mk_r_binop (r_concat
,
1631 *last_non_regular_expression
, append
);
1634 *last_non_regular_expression
= concat
;
1635 last_non_regular_expression
= &concat
->params
.pair
.right
;
1636 last_expression
= last_non_regular_expression
;
1641 } /* while p != pend */
1644 /* Through the pattern now. */
1646 if (!COMPILE_STACK_EMPTY
)
1647 { compile_error
= REG_EPAREN
; goto error_return
; }
1648 free (compile_stack
.stack
);
1655 compile_error
= REG_ESPACE
;
1658 free (compile_stack
.stack
);
1659 /* Free expressions pushed onto the compile stack! */
1661 rx_free_rexp (rexp
);
1662 return compile_error
;