]> jfr.im git - irc/evilnet/x3.git/blob - rx/rxsuper.h
Rewrote PHP X3 DB parser function sample code as a class and faster code
[irc/evilnet/x3.git] / rx / rxsuper.h
1 /* classes: h_files */
2
3 #ifndef RXSUPERH
4 #define RXSUPERH
5
6 /* Copyright (C) 1995, 1996 Tom Lord
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Library General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
11 * any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this software; see the file COPYING. If not, write to
20 * the Free Software Foundation, 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 */
23
24 /* lord Sun May 7 12:40:17 1995 */
25
26 \f
27
28 #include "rxnfa.h"
29
30 \f
31
32 /* This begins the description of the superstate NFA.
33 *
34 * The superstate NFA corresponds to the NFA in these ways:
35 *
36 * Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
37 *
38 * Superstate edges correspond to NFA paths.
39 *
40 * The superstate has no epsilon transitions;
41 * every edge has a character label, and a (possibly empty) side
42 * effect label. The side effect label corresponds to a list of
43 * side effects that occur in the NFA. These parts are referred
44 * to as: superedge_character(EDGE) and superedge_sides(EDGE).
45 *
46 * For a superstate edge EDGE starting in some superstate SUPER,
47 * the following is true (in pseudo-notation :-):
48 *
49 * exists DEST in nfa_states s.t.
50 * exists nfaEDGE in nfa_edges s.t.
51 * origin (nfaEDGE) == DEST
52 * && origin (nfaEDGE) is a member of nfa_states(SUPER)
53 * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
54 * sides_of_possible_future (PF) == superedge_sides (EDGE)
55 *
56 * also:
57 *
58 * let SUPER2 := superedge_destination(EDGE)
59 * nfa_states(SUPER2)
60 * == union of all nfa state sets S s.t.
61 * exists PF in possible_futures(dest(nfaEDGE)) s.t.
62 * sides_of_possible_future (PF) == superedge_sides (EDGE)
63 * && S == dests_of_possible_future (PF) }
64 *
65 * Or in english, every superstate is a set of nfa states. A given
66 * character and a superstate implies many transitions in the NFA --
67 * those that begin with an edge labeled with that character from a
68 * state in the set corresponding to the superstate.
69 *
70 * The destinations of those transitions each have a set of possible
71 * futures. A possible future is a list of side effects and a set of
72 * destination NFA states. Two sets of possible futures can be
73 * `merged' by combining all pairs of possible futures that have the
74 * same side effects. A pair is combined by creating a new future
75 * with the same side effect but the union of the two destination sets.
76 * In this way, all the possible futures suggested by a superstate
77 * and a character can be merged into a set of possible futures where
78 * no two elements of the set have the same set of side effects.
79 *
80 * The destination of a possible future, being a set of NFA states,
81 * corresponds to a supernfa state. So, the merged set of possible
82 * futures we just created can serve as a set of edges in the
83 * supernfa.
84 *
85 * The representation of the superstate nfa and the nfa is critical.
86 * The nfa has to be compact, but has to facilitate the rapid
87 * computation of missing superstates. The superstate nfa has to
88 * be fast to interpret, lazilly constructed, and bounded in space.
89 *
90 * To facilitate interpretation, the superstate data structures are
91 * peppered with `instruction frames'. There is an instruction set
92 * defined below which matchers using the supernfa must be able to
93 * interpret.
94 *
95 * We'd like to make it possible but not mandatory to use code
96 * addresses to represent instructions (c.f. gcc's computed goto).
97 * Therefore, we define an enumerated type of opcodes, and when
98 * writing one of these instructions into a data structure, use
99 * the opcode as an index into a table of instruction values.
100 *
101 * Below are the opcodes that occur in the superstate nfa.
102 *
103 * The descriptions of the opcodes refer to data structures that are
104 * described further below.
105 */
106
107 enum rx_opcode
108 {
109 /*
110 * BACKTRACK_POINT is invoked when a character transition in
111 * a superstate leads to more than one edge. In that case,
112 * the edges have to be explored independently using a backtracking
113 * strategy.
114 *
115 * A BACKTRACK_POINT instruction is stored in a superstate's
116 * transition table for some character when it is known that that
117 * character crosses more than one edge. On encountering this
118 * instruction, the matcher saves enough state to backtrack to this
119 * point later in the match.
120 */
121 rx_backtrack_point = 0, /* data is (struct transition_class *) */
122
123 /*
124 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
125 * There is one occurence of this instruction per rx_distinct_future.
126 * This instruction is skipped if a rx_distinct_future has no side effects.
127 */
128 rx_do_side_effects = rx_backtrack_point + 1,
129
130 /* data is (struct rx_distinct_future *) */
131
132 /*
133 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
134 * destination superstate has been reclaimed (or was never built).
135 * It recomputes the destination superstate.
136 * RX_CACHE_MISS is also stored in a superstate transition table before
137 * any of its edges have been built.
138 */
139 rx_cache_miss = rx_do_side_effects + 1,
140 /* data is (struct rx_distinct_future *) */
141
142 /*
143 * RX_NEXT_CHAR is called to consume the next character and take the
144 * corresponding transition. This is the only instruction that uses
145 * the DATA field of the instruction frame instead of DATA_2.
146 * The comments about rx_inx explain this further.
147 */
148 rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
149
150 /* RX_BACKTRACK indicates that a transition fails. Don't
151 * confuse this with rx_backtrack_point.
152 */
153 rx_backtrack = rx_next_char + 1, /* no data */
154
155 /*
156 * RX_ERROR_INX is stored only in places that should never be executed.
157 */
158 rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
159
160 rx_num_instructions = rx_error_inx + 1
161 };
162
163 /* The heart of the matcher is a `word-code-interpreter'
164 * (like a byte-code interpreter, except that instructions
165 * are a full word wide).
166 *
167 * Instructions are not stored in a vector of code, instead,
168 * they are scattered throughout the data structures built
169 * by the regexp compiler and the matcher. One word-code instruction,
170 * together with the arguments to that instruction, constitute
171 * an instruction frame (struct rx_inx).
172 *
173 * This structure type is padded by hand to a power of 2 because
174 * in one of the dominant cases, we dispatch by indexing a table
175 * of instruction frames. If that indexing can be accomplished
176 * by just a shift of the index, we're happy.
177 *
178 * Instructions take at most one argument, but there are two
179 * slots in an instruction frame that might hold that argument.
180 * These are called data and data_2. The data slot is only
181 * used for one instruction (RX_NEXT_CHAR). For all other
182 * instructions, data should be set to 0.
183 *
184 * RX_NEXT_CHAR is the most important instruction by far.
185 * By reserving the data field for its exclusive use,
186 * instruction dispatch is sped up in that case. There is
187 * no need to fetch both the instruction and the data,
188 * only the data is needed. In other words, a `cycle' begins
189 * by fetching the field data. If that is non-0, then it must
190 * be the destination state of a next_char transition, so
191 * make that value the current state, advance the match position
192 * by one character, and start a new cycle. On the other hand,
193 * if data is 0, fetch the instruction and do a more complicated
194 * dispatch on that.
195 */
196
197 struct rx_inx
198 {
199 void * data;
200 void * data_2;
201 void * inx;
202 void * fnord;
203 };
204
205 #ifndef RX_TAIL_ARRAY
206 #define RX_TAIL_ARRAY 1
207 #endif
208
209 /* A superstate corresponds to a set of nfa states. Those sets are
210 * represented by STRUCT RX_SUPERSET. The constructors
211 * guarantee that only one (shared) structure is created for a given set.
212 */
213 struct rx_superset
214 {
215 int refs; /* This is a reference counted structure. */
216
217 /* We keep these sets in a cache because (in an unpredictable way),
218 * the same set is often created again and again.
219 *
220 * When an NFA is destroyed, some of the supersets for that NFA
221 * may still exist. This can lead to false cache hits -- an apparent cache
222 * hit on a superset that properly belongs to an already free NFA.
223 *
224 * When a cache hit appears to occur, we will have in hand the
225 * nfa for which it may have happened. Every nfa is given
226 * its own sequence number. The cache is validated
227 * by comparing the nfa sequence number to this field:
228 */
229 int id;
230
231 struct rx_nfa_state * car; /* May or may not be a valid addr. */
232 struct rx_superset * cdr;
233
234 /* If the corresponding superstate exists: */
235 struct rx_superstate * superstate;
236
237 /* That is_final field of the constiuent nfa states which has the greatest magnitude. */
238 int is_final;
239
240 /* The OR of the corresponding fields of the constiuent nfa states. */
241 int has_cset_edges;
242
243
244 /* There is another bookkeeping problem. It is expensive to
245 * compute the starting nfa state set for an nfa. So, once computed,
246 * it is cached in the `struct rx'.
247 *
248 * But, the state set can be flushed from the superstate cache.
249 * When that happens, the cached value in the `struct rx' has
250 * to be flushed.
251 */
252 struct rx * starts_for;
253
254 /* This is used to link into a hash bucket so these objects can
255 * be `hash-consed'.
256 */
257 struct rx_hash_item hash_item;
258 };
259
260 #define rx_protect_superset(RX,CON) (++(CON)->refs)
261
262 /* The terminology may be confusing (rename this structure?).
263 * Every character occurs in at most one rx_super_edge per super-state.
264 * But, that structure might have more than one option, indicating a point
265 * of non-determinism.
266 *
267 * In other words, this structure holds a list of superstate edges
268 * sharing a common starting state and character label. The edges
269 * are in the field OPTIONS. All superstate edges sharing the same
270 * starting state and character are in this list.
271 */
272 struct rx_super_edge
273 {
274 struct rx_super_edge *next;
275 struct rx_inx rx_backtrack_frame;
276 int cset_size;
277 rx_Bitset cset;
278 struct rx_distinct_future *options;
279 };
280
281 /* A superstate is a set of nfa states (RX_SUPERSET) along
282 * with a transition table. Superstates are built on demand and reclaimed
283 * without warning. To protect a superstate from this ghastly fate,
284 * use LOCK_SUPERSTATE.
285 */
286 struct rx_superstate
287 {
288 int rx_id; /* c.f. the id field of rx_superset */
289 int locks; /* protection from reclamation */
290
291 /* Within a superstate cache, all the superstates are kept in a big
292 * queue. The tail of the queue is the state most likely to be
293 * reclaimed. The *recyclable fields hold the queue position of
294 * this state.
295 */
296 struct rx_superstate * next_recyclable;
297 struct rx_superstate * prev_recyclable;
298
299 /* The supernfa edges that exist in the cache and that have
300 * this state as their destination are kept in this list:
301 */
302 struct rx_distinct_future * transition_refs;
303
304 /* The list of nfa states corresponding to this superstate: */
305 struct rx_superset * contents;
306
307 /* The list of edges in the cache beginning from this state. */
308 struct rx_super_edge * edges;
309
310 /* A tail of the recyclable queue is marked as semifree. A semifree
311 * state has no incoming next_char transitions -- any transition
312 * into a semifree state causes a complex dispatch with the side
313 * effect of rescuing the state from its semifree state into a
314 * fully free state at the head of the queue.
315 *
316 * An alternative to this might be to make next_char more expensive,
317 * and to move a state to the head of the recyclable queue whenever
318 * it is entered. That way, popular states would never be recycled.
319 *
320 * But unilaterally making next_char more expensive actually loses.
321 * So, incoming transitions are only made expensive for states near
322 * the tail of the recyclable queue. The more cache contention
323 * there is, the more frequently a state will have to prove itself
324 * and be moved back to the front of the queue. If there is less
325 * contention, then popular states just aggregate in the front of
326 * the queue and stay there.
327 */
328 int is_semifree;
329
330
331 /* This keeps track of the size of the transition table for this
332 * state. There is a half-hearted attempt to support variable sized
333 * superstates.
334 */
335 int trans_size;
336
337 /* Indexed by characters... */
338 struct rx_inx transitions[RX_TAIL_ARRAY];
339 };
340
341
342 /* A list of distinct futures define the edges that leave from a
343 * given superstate on a given character. c.f. rx_super_edge.
344 */
345 struct rx_distinct_future
346 {
347 struct rx_distinct_future * next_same_super_edge[2];
348 struct rx_distinct_future * next_same_dest;
349 struct rx_distinct_future * prev_same_dest;
350 struct rx_superstate * present; /* source state */
351 struct rx_superstate * future; /* destination state */
352 struct rx_super_edge * edge;
353
354
355 /* The future_frame holds the instruction that should be executed
356 * after all the side effects are done, when it is time to complete
357 * the transition to the next state.
358 *
359 * Normally this is a next_char instruction, but it may be a
360 * cache_miss instruction as well, depending on whether or not
361 * the superstate is in the cache and semifree.
362 *
363 * If this is the only future for a given superstate/char, and
364 * if there are no side effects to be performed, this frame is
365 * not used (directly) at all. Instead, its contents are copied
366 * into the transition table of the starting state of this dist. future
367 * (a sort of goto elimination).
368 */
369 struct rx_inx future_frame;
370
371 struct rx_inx side_effects_frame;
372 struct rx_se_list * effects;
373 };
374
375 #define rx_lock_superstate(R,S) ((S)->locks++)
376 #define rx_unlock_superstate(R,S) (--(S)->locks)
377 \f
378 struct rx_cache;
379
380 #ifdef __STDC__
381 typedef void (*rx_morecore_fn)(struct rx_cache *);
382 #else
383 typedef void (*rx_morecore_fn)();
384 #endif
385
386 struct rx_cache
387 {
388 struct rx_hash_rules superset_hash_rules;
389
390 struct rx_superstate * lru_superstate;
391 struct rx_superstate * semifree_superstate;
392
393 struct rx_superset * empty_superset;
394
395 int superstates;
396 int semifree_superstates;
397 int hits;
398 int misses;
399
400 int bytes_allowed;
401 int bytes_used;
402
403 int local_cset_size;
404 void ** instruction_table;
405
406 struct rx_hash superset_table;
407 };
408
409 #ifndef RX_DEFAULT_DFA_CACHE_SIZE
410 /* This is an upper bound on the number of bytes that may (normally)
411 * be allocated for DFA states. If this threshold would be exceeded,
412 * Rx tries to flush some DFA states from the cache.
413 *
414 * This value is used whenever the rx_default_cache is used (for example,
415 * with the Posix entry points).
416 */
417 #define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
418 #endif
419
420 extern struct rx_cache * rx_default_cache;
421
422 \f
423 #ifdef __STDC__
424 extern char * rx_cache_malloc (struct rx_cache * cache, int size);
425 extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
426 extern void rx_release_superset (struct rx *rx,
427 struct rx_superset *set);
428 extern struct rx_superset * rx_superset_cons (struct rx * rx,
429 struct rx_nfa_state *car, struct rx_superset *cdr);
430 extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
431 extern struct rx_superstate * rx_superstate (struct rx *rx,
432 struct rx_superset *set);
433 extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;
434
435 #else /* STDC */
436 extern char * rx_cache_malloc ();
437 extern void rx_cache_free ();
438 extern void rx_release_superset ();
439 extern struct rx_superset * rx_superset_cons ();
440 extern struct rx_superset * rx_superstate_eclosure_union ();
441 extern struct rx_superstate * rx_superstate ();
442 extern struct rx_inx * rx_handle_cache_miss ();
443
444 #endif /* STDC */
445
446 #endif /* RXSUPERH */