6 /* Copyright (C) 1995, 1996 Tom Lord
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Library General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this software; see the file COPYING. If not, write to
20 * the Free Software Foundation, 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
24 /* lord Sun May 7 12:40:17 1995 */
32 /* This begins the description of the superstate NFA.
34 * The superstate NFA corresponds to the NFA in these ways:
36 * Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
38 * Superstate edges correspond to NFA paths.
40 * The superstate has no epsilon transitions;
41 * every edge has a character label, and a (possibly empty) side
42 * effect label. The side effect label corresponds to a list of
43 * side effects that occur in the NFA. These parts are referred
44 * to as: superedge_character(EDGE) and superedge_sides(EDGE).
46 * For a superstate edge EDGE starting in some superstate SUPER,
47 * the following is true (in pseudo-notation :-):
49 * exists DEST in nfa_states s.t.
50 * exists nfaEDGE in nfa_edges s.t.
51 * origin (nfaEDGE) == DEST
52 * && origin (nfaEDGE) is a member of nfa_states(SUPER)
53 * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
54 * sides_of_possible_future (PF) == superedge_sides (EDGE)
58 * let SUPER2 := superedge_destination(EDGE)
60 * == union of all nfa state sets S s.t.
61 * exists PF in possible_futures(dest(nfaEDGE)) s.t.
62 * sides_of_possible_future (PF) == superedge_sides (EDGE)
63 * && S == dests_of_possible_future (PF) }
65 * Or in english, every superstate is a set of nfa states. A given
66 * character and a superstate implies many transitions in the NFA --
67 * those that begin with an edge labeled with that character from a
68 * state in the set corresponding to the superstate.
70 * The destinations of those transitions each have a set of possible
71 * futures. A possible future is a list of side effects and a set of
72 * destination NFA states. Two sets of possible futures can be
73 * `merged' by combining all pairs of possible futures that have the
74 * same side effects. A pair is combined by creating a new future
75 * with the same side effect but the union of the two destination sets.
76 * In this way, all the possible futures suggested by a superstate
77 * and a character can be merged into a set of possible futures where
78 * no two elements of the set have the same set of side effects.
80 * The destination of a possible future, being a set of NFA states,
81 * corresponds to a supernfa state. So, the merged set of possible
82 * futures we just created can serve as a set of edges in the
85 * The representation of the superstate nfa and the nfa is critical.
86 * The nfa has to be compact, but has to facilitate the rapid
87 * computation of missing superstates. The superstate nfa has to
88 * be fast to interpret, lazilly constructed, and bounded in space.
90 * To facilitate interpretation, the superstate data structures are
91 * peppered with `instruction frames'. There is an instruction set
92 * defined below which matchers using the supernfa must be able to
95 * We'd like to make it possible but not mandatory to use code
96 * addresses to represent instructions (c.f. gcc's computed goto).
97 * Therefore, we define an enumerated type of opcodes, and when
98 * writing one of these instructions into a data structure, use
99 * the opcode as an index into a table of instruction values.
101 * Below are the opcodes that occur in the superstate nfa.
103 * The descriptions of the opcodes refer to data structures that are
104 * described further below.
110 * BACKTRACK_POINT is invoked when a character transition in
111 * a superstate leads to more than one edge. In that case,
112 * the edges have to be explored independently using a backtracking
115 * A BACKTRACK_POINT instruction is stored in a superstate's
116 * transition table for some character when it is known that that
117 * character crosses more than one edge. On encountering this
118 * instruction, the matcher saves enough state to backtrack to this
119 * point later in the match.
121 rx_backtrack_point
= 0, /* data is (struct transition_class *) */
124 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
125 * There is one occurence of this instruction per rx_distinct_future.
126 * This instruction is skipped if a rx_distinct_future has no side effects.
128 rx_do_side_effects
= rx_backtrack_point
+ 1,
130 /* data is (struct rx_distinct_future *) */
133 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
134 * destination superstate has been reclaimed (or was never built).
135 * It recomputes the destination superstate.
136 * RX_CACHE_MISS is also stored in a superstate transition table before
137 * any of its edges have been built.
139 rx_cache_miss
= rx_do_side_effects
+ 1,
140 /* data is (struct rx_distinct_future *) */
143 * RX_NEXT_CHAR is called to consume the next character and take the
144 * corresponding transition. This is the only instruction that uses
145 * the DATA field of the instruction frame instead of DATA_2.
146 * The comments about rx_inx explain this further.
148 rx_next_char
= rx_cache_miss
+ 1, /* data is (struct superstate *) */
150 /* RX_BACKTRACK indicates that a transition fails. Don't
151 * confuse this with rx_backtrack_point.
153 rx_backtrack
= rx_next_char
+ 1, /* no data */
156 * RX_ERROR_INX is stored only in places that should never be executed.
158 rx_error_inx
= rx_backtrack
+ 1, /* Not supposed to occur. */
160 rx_num_instructions
= rx_error_inx
+ 1
163 /* The heart of the matcher is a `word-code-interpreter'
164 * (like a byte-code interpreter, except that instructions
165 * are a full word wide).
167 * Instructions are not stored in a vector of code, instead,
168 * they are scattered throughout the data structures built
169 * by the regexp compiler and the matcher. One word-code instruction,
170 * together with the arguments to that instruction, constitute
171 * an instruction frame (struct rx_inx).
173 * This structure type is padded by hand to a power of 2 because
174 * in one of the dominant cases, we dispatch by indexing a table
175 * of instruction frames. If that indexing can be accomplished
176 * by just a shift of the index, we're happy.
178 * Instructions take at most one argument, but there are two
179 * slots in an instruction frame that might hold that argument.
180 * These are called data and data_2. The data slot is only
181 * used for one instruction (RX_NEXT_CHAR). For all other
182 * instructions, data should be set to 0.
184 * RX_NEXT_CHAR is the most important instruction by far.
185 * By reserving the data field for its exclusive use,
186 * instruction dispatch is sped up in that case. There is
187 * no need to fetch both the instruction and the data,
188 * only the data is needed. In other words, a `cycle' begins
189 * by fetching the field data. If that is non-0, then it must
190 * be the destination state of a next_char transition, so
191 * make that value the current state, advance the match position
192 * by one character, and start a new cycle. On the other hand,
193 * if data is 0, fetch the instruction and do a more complicated
205 #ifndef RX_TAIL_ARRAY
206 #define RX_TAIL_ARRAY 1
209 /* A superstate corresponds to a set of nfa states. Those sets are
210 * represented by STRUCT RX_SUPERSET. The constructors
211 * guarantee that only one (shared) structure is created for a given set.
215 int refs
; /* This is a reference counted structure. */
217 /* We keep these sets in a cache because (in an unpredictable way),
218 * the same set is often created again and again.
220 * When an NFA is destroyed, some of the supersets for that NFA
221 * may still exist. This can lead to false cache hits -- an apparent cache
222 * hit on a superset that properly belongs to an already free NFA.
224 * When a cache hit appears to occur, we will have in hand the
225 * nfa for which it may have happened. Every nfa is given
226 * its own sequence number. The cache is validated
227 * by comparing the nfa sequence number to this field:
231 struct rx_nfa_state
* car
; /* May or may not be a valid addr. */
232 struct rx_superset
* cdr
;
234 /* If the corresponding superstate exists: */
235 struct rx_superstate
* superstate
;
237 /* That is_final field of the constiuent nfa states which has the greatest magnitude. */
240 /* The OR of the corresponding fields of the constiuent nfa states. */
244 /* There is another bookkeeping problem. It is expensive to
245 * compute the starting nfa state set for an nfa. So, once computed,
246 * it is cached in the `struct rx'.
248 * But, the state set can be flushed from the superstate cache.
249 * When that happens, the cached value in the `struct rx' has
252 struct rx
* starts_for
;
254 /* This is used to link into a hash bucket so these objects can
257 struct rx_hash_item hash_item
;
260 #define rx_protect_superset(RX,CON) (++(CON)->refs)
262 /* The terminology may be confusing (rename this structure?).
263 * Every character occurs in at most one rx_super_edge per super-state.
264 * But, that structure might have more than one option, indicating a point
265 * of non-determinism.
267 * In other words, this structure holds a list of superstate edges
268 * sharing a common starting state and character label. The edges
269 * are in the field OPTIONS. All superstate edges sharing the same
270 * starting state and character are in this list.
274 struct rx_super_edge
*next
;
275 struct rx_inx rx_backtrack_frame
;
278 struct rx_distinct_future
*options
;
281 /* A superstate is a set of nfa states (RX_SUPERSET) along
282 * with a transition table. Superstates are built on demand and reclaimed
283 * without warning. To protect a superstate from this ghastly fate,
284 * use LOCK_SUPERSTATE.
288 int rx_id
; /* c.f. the id field of rx_superset */
289 int locks
; /* protection from reclamation */
291 /* Within a superstate cache, all the superstates are kept in a big
292 * queue. The tail of the queue is the state most likely to be
293 * reclaimed. The *recyclable fields hold the queue position of
296 struct rx_superstate
* next_recyclable
;
297 struct rx_superstate
* prev_recyclable
;
299 /* The supernfa edges that exist in the cache and that have
300 * this state as their destination are kept in this list:
302 struct rx_distinct_future
* transition_refs
;
304 /* The list of nfa states corresponding to this superstate: */
305 struct rx_superset
* contents
;
307 /* The list of edges in the cache beginning from this state. */
308 struct rx_super_edge
* edges
;
310 /* A tail of the recyclable queue is marked as semifree. A semifree
311 * state has no incoming next_char transitions -- any transition
312 * into a semifree state causes a complex dispatch with the side
313 * effect of rescuing the state from its semifree state into a
314 * fully free state at the head of the queue.
316 * An alternative to this might be to make next_char more expensive,
317 * and to move a state to the head of the recyclable queue whenever
318 * it is entered. That way, popular states would never be recycled.
320 * But unilaterally making next_char more expensive actually loses.
321 * So, incoming transitions are only made expensive for states near
322 * the tail of the recyclable queue. The more cache contention
323 * there is, the more frequently a state will have to prove itself
324 * and be moved back to the front of the queue. If there is less
325 * contention, then popular states just aggregate in the front of
326 * the queue and stay there.
331 /* This keeps track of the size of the transition table for this
332 * state. There is a half-hearted attempt to support variable sized
337 /* Indexed by characters... */
338 struct rx_inx transitions
[RX_TAIL_ARRAY
];
342 /* A list of distinct futures define the edges that leave from a
343 * given superstate on a given character. c.f. rx_super_edge.
345 struct rx_distinct_future
347 struct rx_distinct_future
* next_same_super_edge
[2];
348 struct rx_distinct_future
* next_same_dest
;
349 struct rx_distinct_future
* prev_same_dest
;
350 struct rx_superstate
* present
; /* source state */
351 struct rx_superstate
* future
; /* destination state */
352 struct rx_super_edge
* edge
;
355 /* The future_frame holds the instruction that should be executed
356 * after all the side effects are done, when it is time to complete
357 * the transition to the next state.
359 * Normally this is a next_char instruction, but it may be a
360 * cache_miss instruction as well, depending on whether or not
361 * the superstate is in the cache and semifree.
363 * If this is the only future for a given superstate/char, and
364 * if there are no side effects to be performed, this frame is
365 * not used (directly) at all. Instead, its contents are copied
366 * into the transition table of the starting state of this dist. future
367 * (a sort of goto elimination).
369 struct rx_inx future_frame
;
371 struct rx_inx side_effects_frame
;
372 struct rx_se_list
* effects
;
375 #define rx_lock_superstate(R,S) ((S)->locks++)
376 #define rx_unlock_superstate(R,S) (--(S)->locks)
381 typedef void (*rx_morecore_fn
)(struct rx_cache
*);
383 typedef void (*rx_morecore_fn
)();
388 struct rx_hash_rules superset_hash_rules
;
390 struct rx_superstate
* lru_superstate
;
391 struct rx_superstate
* semifree_superstate
;
393 struct rx_superset
* empty_superset
;
396 int semifree_superstates
;
404 void ** instruction_table
;
406 struct rx_hash superset_table
;
409 #ifndef RX_DEFAULT_DFA_CACHE_SIZE
410 /* This is an upper bound on the number of bytes that may (normally)
411 * be allocated for DFA states. If this threshold would be exceeded,
412 * Rx tries to flush some DFA states from the cache.
414 * This value is used whenever the rx_default_cache is used (for example,
415 * with the Posix entry points).
417 #define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
420 extern struct rx_cache
* rx_default_cache
;
424 extern char * rx_cache_malloc (struct rx_cache
* cache
, int size
);
425 extern void rx_cache_free (struct rx_cache
* cache
, int size
, char * mem
);
426 extern void rx_release_superset (struct rx
*rx
,
427 struct rx_superset
*set
);
428 extern struct rx_superset
* rx_superset_cons (struct rx
* rx
,
429 struct rx_nfa_state
*car
, struct rx_superset
*cdr
);
430 extern struct rx_superset
* rx_superstate_eclosure_union (struct rx
* rx
, struct rx_superset
*set
, struct rx_nfa_state_set
*ecl
) ;
431 extern struct rx_superstate
* rx_superstate (struct rx
*rx
,
432 struct rx_superset
*set
);
433 extern struct rx_inx
* rx_handle_cache_miss (struct rx
*rx
, struct rx_superstate
*super
, unsigned char chr
, void *data
) ;
436 extern char * rx_cache_malloc ();
437 extern void rx_cache_free ();
438 extern void rx_release_superset ();
439 extern struct rx_superset
* rx_superset_cons ();
440 extern struct rx_superset
* rx_superstate_eclosure_union ();
441 extern struct rx_superstate
* rx_superstate ();
442 extern struct rx_inx
* rx_handle_cache_miss ();
446 #endif /* RXSUPERH */