]>
Commit | Line | Data |
---|---|---|
1 | /* classes: h_files */ | |
2 | ||
3 | #ifndef RXSUPERH | |
4 | #define RXSUPERH | |
5 | ||
6 | /* Copyright (C) 1995, 1996 Tom Lord | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or modify | |
9 | * it under the terms of the GNU Library General Public License as published by | |
10 | * the Free Software Foundation; either version 2, or (at your option) | |
11 | * any later version. | |
12 | * | |
13 | * This program is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | * GNU Library General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Library General Public License | |
19 | * along with this software; see the file COPYING. If not, write to | |
20 | * the Free Software Foundation, 59 Temple Place - Suite 330, | |
21 | * Boston, MA 02111-1307, USA. | |
22 | */ | |
23 | ||
24 | /* lord Sun May 7 12:40:17 1995 */ | |
25 | ||
26 | \f | |
27 | ||
28 | #include "rxnfa.h" | |
29 | ||
30 | \f | |
31 | ||
32 | /* This begins the description of the superstate NFA. | |
33 | * | |
34 | * The superstate NFA corresponds to the NFA in these ways: | |
35 | * | |
36 | * Superstate states correspond to sets of NFA states (nfa_states(SUPER)), | |
37 | * | |
38 | * Superstate edges correspond to NFA paths. | |
39 | * | |
40 | * The superstate has no epsilon transitions; | |
41 | * every edge has a character label, and a (possibly empty) side | |
42 | * effect label. The side effect label corresponds to a list of | |
43 | * side effects that occur in the NFA. These parts are referred | |
44 | * to as: superedge_character(EDGE) and superedge_sides(EDGE). | |
45 | * | |
46 | * For a superstate edge EDGE starting in some superstate SUPER, | |
47 | * the following is true (in pseudo-notation :-): | |
48 | * | |
49 | * exists DEST in nfa_states s.t. | |
50 | * exists nfaEDGE in nfa_edges s.t. | |
51 | * origin (nfaEDGE) == DEST | |
52 | * && origin (nfaEDGE) is a member of nfa_states(SUPER) | |
53 | * && exists PF in possible_futures(dest(nfaEDGE)) s.t. | |
54 | * sides_of_possible_future (PF) == superedge_sides (EDGE) | |
55 | * | |
56 | * also: | |
57 | * | |
58 | * let SUPER2 := superedge_destination(EDGE) | |
59 | * nfa_states(SUPER2) | |
60 | * == union of all nfa state sets S s.t. | |
61 | * exists PF in possible_futures(dest(nfaEDGE)) s.t. | |
62 | * sides_of_possible_future (PF) == superedge_sides (EDGE) | |
63 | * && S == dests_of_possible_future (PF) } | |
64 | * | |
65 | * Or in english, every superstate is a set of nfa states. A given | |
66 | * character and a superstate implies many transitions in the NFA -- | |
67 | * those that begin with an edge labeled with that character from a | |
68 | * state in the set corresponding to the superstate. | |
69 | * | |
70 | * The destinations of those transitions each have a set of possible | |
71 | * futures. A possible future is a list of side effects and a set of | |
72 | * destination NFA states. Two sets of possible futures can be | |
73 | * `merged' by combining all pairs of possible futures that have the | |
74 | * same side effects. A pair is combined by creating a new future | |
75 | * with the same side effect but the union of the two destination sets. | |
76 | * In this way, all the possible futures suggested by a superstate | |
77 | * and a character can be merged into a set of possible futures where | |
78 | * no two elements of the set have the same set of side effects. | |
79 | * | |
80 | * The destination of a possible future, being a set of NFA states, | |
81 | * corresponds to a supernfa state. So, the merged set of possible | |
82 | * futures we just created can serve as a set of edges in the | |
83 | * supernfa. | |
84 | * | |
85 | * The representation of the superstate nfa and the nfa is critical. | |
86 | * The nfa has to be compact, but has to facilitate the rapid | |
87 | * computation of missing superstates. The superstate nfa has to | |
88 | * be fast to interpret, lazilly constructed, and bounded in space. | |
89 | * | |
90 | * To facilitate interpretation, the superstate data structures are | |
91 | * peppered with `instruction frames'. There is an instruction set | |
92 | * defined below which matchers using the supernfa must be able to | |
93 | * interpret. | |
94 | * | |
95 | * We'd like to make it possible but not mandatory to use code | |
96 | * addresses to represent instructions (c.f. gcc's computed goto). | |
97 | * Therefore, we define an enumerated type of opcodes, and when | |
98 | * writing one of these instructions into a data structure, use | |
99 | * the opcode as an index into a table of instruction values. | |
100 | * | |
101 | * Below are the opcodes that occur in the superstate nfa. | |
102 | * | |
103 | * The descriptions of the opcodes refer to data structures that are | |
104 | * described further below. | |
105 | */ | |
106 | ||
107 | enum rx_opcode | |
108 | { | |
109 | /* | |
110 | * BACKTRACK_POINT is invoked when a character transition in | |
111 | * a superstate leads to more than one edge. In that case, | |
112 | * the edges have to be explored independently using a backtracking | |
113 | * strategy. | |
114 | * | |
115 | * A BACKTRACK_POINT instruction is stored in a superstate's | |
116 | * transition table for some character when it is known that that | |
117 | * character crosses more than one edge. On encountering this | |
118 | * instruction, the matcher saves enough state to backtrack to this | |
119 | * point later in the match. | |
120 | */ | |
121 | rx_backtrack_point = 0, /* data is (struct transition_class *) */ | |
122 | ||
123 | /* | |
124 | * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. | |
125 | * There is one occurence of this instruction per rx_distinct_future. | |
126 | * This instruction is skipped if a rx_distinct_future has no side effects. | |
127 | */ | |
128 | rx_do_side_effects = rx_backtrack_point + 1, | |
129 | ||
130 | /* data is (struct rx_distinct_future *) */ | |
131 | ||
132 | /* | |
133 | * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose | |
134 | * destination superstate has been reclaimed (or was never built). | |
135 | * It recomputes the destination superstate. | |
136 | * RX_CACHE_MISS is also stored in a superstate transition table before | |
137 | * any of its edges have been built. | |
138 | */ | |
139 | rx_cache_miss = rx_do_side_effects + 1, | |
140 | /* data is (struct rx_distinct_future *) */ | |
141 | ||
142 | /* | |
143 | * RX_NEXT_CHAR is called to consume the next character and take the | |
144 | * corresponding transition. This is the only instruction that uses | |
145 | * the DATA field of the instruction frame instead of DATA_2. | |
146 | * The comments about rx_inx explain this further. | |
147 | */ | |
148 | rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ | |
149 | ||
150 | /* RX_BACKTRACK indicates that a transition fails. Don't | |
151 | * confuse this with rx_backtrack_point. | |
152 | */ | |
153 | rx_backtrack = rx_next_char + 1, /* no data */ | |
154 | ||
155 | /* | |
156 | * RX_ERROR_INX is stored only in places that should never be executed. | |
157 | */ | |
158 | rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ | |
159 | ||
160 | rx_num_instructions = rx_error_inx + 1 | |
161 | }; | |
162 | ||
163 | /* The heart of the matcher is a `word-code-interpreter' | |
164 | * (like a byte-code interpreter, except that instructions | |
165 | * are a full word wide). | |
166 | * | |
167 | * Instructions are not stored in a vector of code, instead, | |
168 | * they are scattered throughout the data structures built | |
169 | * by the regexp compiler and the matcher. One word-code instruction, | |
170 | * together with the arguments to that instruction, constitute | |
171 | * an instruction frame (struct rx_inx). | |
172 | * | |
173 | * This structure type is padded by hand to a power of 2 because | |
174 | * in one of the dominant cases, we dispatch by indexing a table | |
175 | * of instruction frames. If that indexing can be accomplished | |
176 | * by just a shift of the index, we're happy. | |
177 | * | |
178 | * Instructions take at most one argument, but there are two | |
179 | * slots in an instruction frame that might hold that argument. | |
180 | * These are called data and data_2. The data slot is only | |
181 | * used for one instruction (RX_NEXT_CHAR). For all other | |
182 | * instructions, data should be set to 0. | |
183 | * | |
184 | * RX_NEXT_CHAR is the most important instruction by far. | |
185 | * By reserving the data field for its exclusive use, | |
186 | * instruction dispatch is sped up in that case. There is | |
187 | * no need to fetch both the instruction and the data, | |
188 | * only the data is needed. In other words, a `cycle' begins | |
189 | * by fetching the field data. If that is non-0, then it must | |
190 | * be the destination state of a next_char transition, so | |
191 | * make that value the current state, advance the match position | |
192 | * by one character, and start a new cycle. On the other hand, | |
193 | * if data is 0, fetch the instruction and do a more complicated | |
194 | * dispatch on that. | |
195 | */ | |
196 | ||
197 | struct rx_inx | |
198 | { | |
199 | void * data; | |
200 | void * data_2; | |
201 | void * inx; | |
202 | void * fnord; | |
203 | }; | |
204 | ||
205 | #ifndef RX_TAIL_ARRAY | |
206 | #define RX_TAIL_ARRAY 1 | |
207 | #endif | |
208 | ||
209 | /* A superstate corresponds to a set of nfa states. Those sets are | |
210 | * represented by STRUCT RX_SUPERSET. The constructors | |
211 | * guarantee that only one (shared) structure is created for a given set. | |
212 | */ | |
213 | struct rx_superset | |
214 | { | |
215 | int refs; /* This is a reference counted structure. */ | |
216 | ||
217 | /* We keep these sets in a cache because (in an unpredictable way), | |
218 | * the same set is often created again and again. | |
219 | * | |
220 | * When an NFA is destroyed, some of the supersets for that NFA | |
221 | * may still exist. This can lead to false cache hits -- an apparent cache | |
222 | * hit on a superset that properly belongs to an already free NFA. | |
223 | * | |
224 | * When a cache hit appears to occur, we will have in hand the | |
225 | * nfa for which it may have happened. Every nfa is given | |
226 | * its own sequence number. The cache is validated | |
227 | * by comparing the nfa sequence number to this field: | |
228 | */ | |
229 | int id; | |
230 | ||
231 | struct rx_nfa_state * car; /* May or may not be a valid addr. */ | |
232 | struct rx_superset * cdr; | |
233 | ||
234 | /* If the corresponding superstate exists: */ | |
235 | struct rx_superstate * superstate; | |
236 | ||
237 | /* That is_final field of the constiuent nfa states which has the greatest magnitude. */ | |
238 | int is_final; | |
239 | ||
240 | /* The OR of the corresponding fields of the constiuent nfa states. */ | |
241 | int has_cset_edges; | |
242 | ||
243 | ||
244 | /* There is another bookkeeping problem. It is expensive to | |
245 | * compute the starting nfa state set for an nfa. So, once computed, | |
246 | * it is cached in the `struct rx'. | |
247 | * | |
248 | * But, the state set can be flushed from the superstate cache. | |
249 | * When that happens, the cached value in the `struct rx' has | |
250 | * to be flushed. | |
251 | */ | |
252 | struct rx * starts_for; | |
253 | ||
254 | /* This is used to link into a hash bucket so these objects can | |
255 | * be `hash-consed'. | |
256 | */ | |
257 | struct rx_hash_item hash_item; | |
258 | }; | |
259 | ||
260 | #define rx_protect_superset(RX,CON) (++(CON)->refs) | |
261 | ||
262 | /* The terminology may be confusing (rename this structure?). | |
263 | * Every character occurs in at most one rx_super_edge per super-state. | |
264 | * But, that structure might have more than one option, indicating a point | |
265 | * of non-determinism. | |
266 | * | |
267 | * In other words, this structure holds a list of superstate edges | |
268 | * sharing a common starting state and character label. The edges | |
269 | * are in the field OPTIONS. All superstate edges sharing the same | |
270 | * starting state and character are in this list. | |
271 | */ | |
272 | struct rx_super_edge | |
273 | { | |
274 | struct rx_super_edge *next; | |
275 | struct rx_inx rx_backtrack_frame; | |
276 | int cset_size; | |
277 | rx_Bitset cset; | |
278 | struct rx_distinct_future *options; | |
279 | }; | |
280 | ||
281 | /* A superstate is a set of nfa states (RX_SUPERSET) along | |
282 | * with a transition table. Superstates are built on demand and reclaimed | |
283 | * without warning. To protect a superstate from this ghastly fate, | |
284 | * use LOCK_SUPERSTATE. | |
285 | */ | |
286 | struct rx_superstate | |
287 | { | |
288 | int rx_id; /* c.f. the id field of rx_superset */ | |
289 | int locks; /* protection from reclamation */ | |
290 | ||
291 | /* Within a superstate cache, all the superstates are kept in a big | |
292 | * queue. The tail of the queue is the state most likely to be | |
293 | * reclaimed. The *recyclable fields hold the queue position of | |
294 | * this state. | |
295 | */ | |
296 | struct rx_superstate * next_recyclable; | |
297 | struct rx_superstate * prev_recyclable; | |
298 | ||
299 | /* The supernfa edges that exist in the cache and that have | |
300 | * this state as their destination are kept in this list: | |
301 | */ | |
302 | struct rx_distinct_future * transition_refs; | |
303 | ||
304 | /* The list of nfa states corresponding to this superstate: */ | |
305 | struct rx_superset * contents; | |
306 | ||
307 | /* The list of edges in the cache beginning from this state. */ | |
308 | struct rx_super_edge * edges; | |
309 | ||
310 | /* A tail of the recyclable queue is marked as semifree. A semifree | |
311 | * state has no incoming next_char transitions -- any transition | |
312 | * into a semifree state causes a complex dispatch with the side | |
313 | * effect of rescuing the state from its semifree state into a | |
314 | * fully free state at the head of the queue. | |
315 | * | |
316 | * An alternative to this might be to make next_char more expensive, | |
317 | * and to move a state to the head of the recyclable queue whenever | |
318 | * it is entered. That way, popular states would never be recycled. | |
319 | * | |
320 | * But unilaterally making next_char more expensive actually loses. | |
321 | * So, incoming transitions are only made expensive for states near | |
322 | * the tail of the recyclable queue. The more cache contention | |
323 | * there is, the more frequently a state will have to prove itself | |
324 | * and be moved back to the front of the queue. If there is less | |
325 | * contention, then popular states just aggregate in the front of | |
326 | * the queue and stay there. | |
327 | */ | |
328 | int is_semifree; | |
329 | ||
330 | ||
331 | /* This keeps track of the size of the transition table for this | |
332 | * state. There is a half-hearted attempt to support variable sized | |
333 | * superstates. | |
334 | */ | |
335 | int trans_size; | |
336 | ||
337 | /* Indexed by characters... */ | |
338 | struct rx_inx transitions[RX_TAIL_ARRAY]; | |
339 | }; | |
340 | ||
341 | ||
342 | /* A list of distinct futures define the edges that leave from a | |
343 | * given superstate on a given character. c.f. rx_super_edge. | |
344 | */ | |
345 | struct rx_distinct_future | |
346 | { | |
347 | struct rx_distinct_future * next_same_super_edge[2]; | |
348 | struct rx_distinct_future * next_same_dest; | |
349 | struct rx_distinct_future * prev_same_dest; | |
350 | struct rx_superstate * present; /* source state */ | |
351 | struct rx_superstate * future; /* destination state */ | |
352 | struct rx_super_edge * edge; | |
353 | ||
354 | ||
355 | /* The future_frame holds the instruction that should be executed | |
356 | * after all the side effects are done, when it is time to complete | |
357 | * the transition to the next state. | |
358 | * | |
359 | * Normally this is a next_char instruction, but it may be a | |
360 | * cache_miss instruction as well, depending on whether or not | |
361 | * the superstate is in the cache and semifree. | |
362 | * | |
363 | * If this is the only future for a given superstate/char, and | |
364 | * if there are no side effects to be performed, this frame is | |
365 | * not used (directly) at all. Instead, its contents are copied | |
366 | * into the transition table of the starting state of this dist. future | |
367 | * (a sort of goto elimination). | |
368 | */ | |
369 | struct rx_inx future_frame; | |
370 | ||
371 | struct rx_inx side_effects_frame; | |
372 | struct rx_se_list * effects; | |
373 | }; | |
374 | ||
375 | #define rx_lock_superstate(R,S) ((S)->locks++) | |
376 | #define rx_unlock_superstate(R,S) (--(S)->locks) | |
377 | \f | |
378 | struct rx_cache; | |
379 | ||
380 | #ifdef __STDC__ | |
381 | typedef void (*rx_morecore_fn)(struct rx_cache *); | |
382 | #else | |
383 | typedef void (*rx_morecore_fn)(); | |
384 | #endif | |
385 | ||
386 | struct rx_cache | |
387 | { | |
388 | struct rx_hash_rules superset_hash_rules; | |
389 | ||
390 | struct rx_superstate * lru_superstate; | |
391 | struct rx_superstate * semifree_superstate; | |
392 | ||
393 | struct rx_superset * empty_superset; | |
394 | ||
395 | int superstates; | |
396 | int semifree_superstates; | |
397 | int hits; | |
398 | int misses; | |
399 | ||
400 | int bytes_allowed; | |
401 | int bytes_used; | |
402 | ||
403 | int local_cset_size; | |
404 | void ** instruction_table; | |
405 | ||
406 | struct rx_hash superset_table; | |
407 | }; | |
408 | ||
409 | #ifndef RX_DEFAULT_DFA_CACHE_SIZE | |
410 | /* This is an upper bound on the number of bytes that may (normally) | |
411 | * be allocated for DFA states. If this threshold would be exceeded, | |
412 | * Rx tries to flush some DFA states from the cache. | |
413 | * | |
414 | * This value is used whenever the rx_default_cache is used (for example, | |
415 | * with the Posix entry points). | |
416 | */ | |
417 | #define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19) | |
418 | #endif | |
419 | ||
420 | extern struct rx_cache * rx_default_cache; | |
421 | ||
422 | \f | |
423 | #ifdef __STDC__ | |
424 | extern char * rx_cache_malloc (struct rx_cache * cache, int size); | |
425 | extern void rx_cache_free (struct rx_cache * cache, int size, char * mem); | |
426 | extern void rx_release_superset (struct rx *rx, | |
427 | struct rx_superset *set); | |
428 | extern struct rx_superset * rx_superset_cons (struct rx * rx, | |
429 | struct rx_nfa_state *car, struct rx_superset *cdr); | |
430 | extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ; | |
431 | extern struct rx_superstate * rx_superstate (struct rx *rx, | |
432 | struct rx_superset *set); | |
433 | extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ; | |
434 | ||
435 | #else /* STDC */ | |
436 | extern char * rx_cache_malloc (); | |
437 | extern void rx_cache_free (); | |
438 | extern void rx_release_superset (); | |
439 | extern struct rx_superset * rx_superset_cons (); | |
440 | extern struct rx_superset * rx_superstate_eclosure_union (); | |
441 | extern struct rx_superstate * rx_superstate (); | |
442 | extern struct rx_inx * rx_handle_cache_miss (); | |
443 | ||
444 | #endif /* STDC */ | |
445 | ||
446 | #endif /* RXSUPERH */ |