jfr.im git - irc/evilnet/x3.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* classes: h_files */
	2
	3	#ifndef RXSUPERH
	4	#define RXSUPERH
	5
	6	/* Copyright (C) 1995, 1996 Tom Lord
	7	*
	8	* This program is free software; you can redistribute it and/or modify
	9	* it under the terms of the GNU Library General Public License as published by
	10	* the Free Software Foundation; either version 2, or (at your option)
	11	* any later version.
	12	*
	13	* This program is distributed in the hope that it will be useful,
	14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	* GNU Library General Public License for more details.
	17	*
	18	* You should have received a copy of the GNU Library General Public License
	19	* along with this software; see the file COPYING. If not, write to
	20	* the Free Software Foundation, 59 Temple Place - Suite 330,
	21	* Boston, MA 02111-1307, USA.
	22	*/
	23
	24	/* lord Sun May 7 12:40:17 1995 */
	25
	26	\f
	27
	28	#include "rxnfa.h"
	29
	30	\f
	31
	32	/* This begins the description of the superstate NFA.
	33	*
	34	* The superstate NFA corresponds to the NFA in these ways:
	35	*
	36	* Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
	37	*
	38	* Superstate edges correspond to NFA paths.
	39	*
	40	* The superstate has no epsilon transitions;
	41	* every edge has a character label, and a (possibly empty) side
	42	* effect label. The side effect label corresponds to a list of
	43	* side effects that occur in the NFA. These parts are referred
	44	* to as: superedge_character(EDGE) and superedge_sides(EDGE).
	45	*
	46	* For a superstate edge EDGE starting in some superstate SUPER,
	47	* the following is true (in pseudo-notation :-):
	48	*
	49	* exists DEST in nfa_states s.t.
	50	* exists nfaEDGE in nfa_edges s.t.
	51	* origin (nfaEDGE) == DEST
	52	* && origin (nfaEDGE) is a member of nfa_states(SUPER)
	53	* && exists PF in possible_futures(dest(nfaEDGE)) s.t.
	54	* sides_of_possible_future (PF) == superedge_sides (EDGE)
	55	*
	56	* also:
	57	*
	58	* let SUPER2 := superedge_destination(EDGE)
	59	* nfa_states(SUPER2)
	60	* == union of all nfa state sets S s.t.
	61	* exists PF in possible_futures(dest(nfaEDGE)) s.t.
	62	* sides_of_possible_future (PF) == superedge_sides (EDGE)
	63	* && S == dests_of_possible_future (PF) }
	64	*
	65	* Or in english, every superstate is a set of nfa states. A given
	66	* character and a superstate implies many transitions in the NFA --
	67	* those that begin with an edge labeled with that character from a
	68	* state in the set corresponding to the superstate.
	69	*
	70	* The destinations of those transitions each have a set of possible
	71	* futures. A possible future is a list of side effects and a set of
	72	* destination NFA states. Two sets of possible futures can be
	73	* `merged' by combining all pairs of possible futures that have the
	74	* same side effects. A pair is combined by creating a new future
	75	* with the same side effect but the union of the two destination sets.
	76	* In this way, all the possible futures suggested by a superstate
	77	* and a character can be merged into a set of possible futures where
	78	* no two elements of the set have the same set of side effects.
	79	*
	80	* The destination of a possible future, being a set of NFA states,
	81	* corresponds to a supernfa state. So, the merged set of possible
	82	* futures we just created can serve as a set of edges in the
	83	* supernfa.
	84	*
	85	* The representation of the superstate nfa and the nfa is critical.
	86	* The nfa has to be compact, but has to facilitate the rapid
	87	* computation of missing superstates. The superstate nfa has to
	88	* be fast to interpret, lazilly constructed, and bounded in space.
	89	*
	90	* To facilitate interpretation, the superstate data structures are
	91	* peppered with `instruction frames'. There is an instruction set
	92	* defined below which matchers using the supernfa must be able to
	93	* interpret.
	94	*
	95	* We'd like to make it possible but not mandatory to use code
	96	* addresses to represent instructions (c.f. gcc's computed goto).
	97	* Therefore, we define an enumerated type of opcodes, and when
	98	* writing one of these instructions into a data structure, use
	99	* the opcode as an index into a table of instruction values.
	100	*
	101	* Below are the opcodes that occur in the superstate nfa.
	102	*
	103	* The descriptions of the opcodes refer to data structures that are
	104	* described further below.
	105	*/
	106
	107	enum rx_opcode
	108	{
	109	/*
	110	* BACKTRACK_POINT is invoked when a character transition in
	111	* a superstate leads to more than one edge. In that case,
	112	* the edges have to be explored independently using a backtracking
	113	* strategy.
	114	*
	115	* A BACKTRACK_POINT instruction is stored in a superstate's
	116	* transition table for some character when it is known that that
	117	* character crosses more than one edge. On encountering this
	118	* instruction, the matcher saves enough state to backtrack to this
	119	* point later in the match.
	120	*/
	121	rx_backtrack_point = 0, /* data is (struct transition_class ) /
	122
	123	/*
	124	* RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
	125	* There is one occurence of this instruction per rx_distinct_future.
	126	* This instruction is skipped if a rx_distinct_future has no side effects.
	127	*/
	128	rx_do_side_effects = rx_backtrack_point + 1,
	129
	130	/* data is (struct rx_distinct_future ) /
	131
	132	/*
	133	* RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
	134	* destination superstate has been reclaimed (or was never built).
	135	* It recomputes the destination superstate.
	136	* RX_CACHE_MISS is also stored in a superstate transition table before
	137	* any of its edges have been built.
	138	*/
	139	rx_cache_miss = rx_do_side_effects + 1,
	140	/* data is (struct rx_distinct_future ) /
	141
	142	/*
	143	* RX_NEXT_CHAR is called to consume the next character and take the
	144	* corresponding transition. This is the only instruction that uses
	145	* the DATA field of the instruction frame instead of DATA_2.
	146	* The comments about rx_inx explain this further.
	147	*/
	148	rx_next_char = rx_cache_miss + 1, /* data is (struct superstate ) /
	149
	150	/* RX_BACKTRACK indicates that a transition fails. Don't
	151	* confuse this with rx_backtrack_point.
	152	*/
	153	rx_backtrack = rx_next_char + 1, /* no data */
	154
	155	/*
	156	* RX_ERROR_INX is stored only in places that should never be executed.
	157	*/
	158	rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
	159
	160	rx_num_instructions = rx_error_inx + 1
	161	};
	162
	163	/* The heart of the matcher is a `word-code-interpreter'
	164	* (like a byte-code interpreter, except that instructions
	165	* are a full word wide).
	166	*
	167	* Instructions are not stored in a vector of code, instead,
	168	* they are scattered throughout the data structures built
	169	* by the regexp compiler and the matcher. One word-code instruction,
	170	* together with the arguments to that instruction, constitute
	171	* an instruction frame (struct rx_inx).
	172	*
	173	* This structure type is padded by hand to a power of 2 because
	174	* in one of the dominant cases, we dispatch by indexing a table
	175	* of instruction frames. If that indexing can be accomplished
	176	* by just a shift of the index, we're happy.
	177	*
	178	* Instructions take at most one argument, but there are two
	179	* slots in an instruction frame that might hold that argument.
	180	* These are called data and data_2. The data slot is only
	181	* used for one instruction (RX_NEXT_CHAR). For all other
	182	* instructions, data should be set to 0.
	183	*
	184	* RX_NEXT_CHAR is the most important instruction by far.
	185	* By reserving the data field for its exclusive use,
	186	* instruction dispatch is sped up in that case. There is
	187	* no need to fetch both the instruction and the data,
	188	* only the data is needed. In other words, a `cycle' begins
	189	* by fetching the field data. If that is non-0, then it must
	190	* be the destination state of a next_char transition, so
	191	* make that value the current state, advance the match position
	192	* by one character, and start a new cycle. On the other hand,
	193	* if data is 0, fetch the instruction and do a more complicated
	194	* dispatch on that.
	195	*/
	196
	197	struct rx_inx
	198	{
	199	void * data;
	200	void * data_2;
	201	void * inx;
	202	void * fnord;
	203	};
	204
	205	#ifndef RX_TAIL_ARRAY
	206	#define RX_TAIL_ARRAY 1
	207	#endif
	208
	209	/* A superstate corresponds to a set of nfa states. Those sets are
	210	* represented by STRUCT RX_SUPERSET. The constructors
	211	* guarantee that only one (shared) structure is created for a given set.
	212	*/
	213	struct rx_superset
	214	{
	215	int refs; /* This is a reference counted structure. */
	216
	217	/* We keep these sets in a cache because (in an unpredictable way),
	218	* the same set is often created again and again.
	219	*
	220	* When an NFA is destroyed, some of the supersets for that NFA
	221	* may still exist. This can lead to false cache hits -- an apparent cache
	222	* hit on a superset that properly belongs to an already free NFA.
	223	*
	224	* When a cache hit appears to occur, we will have in hand the
	225	* nfa for which it may have happened. Every nfa is given
	226	* its own sequence number. The cache is validated
	227	* by comparing the nfa sequence number to this field:
	228	*/
	229	int id;
	230
	231	struct rx_nfa_state * car; /* May or may not be a valid addr. */
	232	struct rx_superset * cdr;
	233
	234	/* If the corresponding superstate exists: */
	235	struct rx_superstate * superstate;
	236
	237	/* That is_final field of the constiuent nfa states which has the greatest magnitude. */
	238	int is_final;
	239
	240	/* The OR of the corresponding fields of the constiuent nfa states. */
	241	int has_cset_edges;
	242
	243
	244	/* There is another bookkeeping problem. It is expensive to
	245	* compute the starting nfa state set for an nfa. So, once computed,
	246	* it is cached in the `struct rx'.
	247	*
	248	* But, the state set can be flushed from the superstate cache.
	249	* When that happens, the cached value in the `struct rx' has
	250	* to be flushed.
	251	*/
	252	struct rx * starts_for;
	253
	254	/* This is used to link into a hash bucket so these objects can
	255	* be `hash-consed'.
	256	*/
	257	struct rx_hash_item hash_item;
	258	};
	259
	260	#define rx_protect_superset(RX,CON) (++(CON)->refs)
	261
	262	/* The terminology may be confusing (rename this structure?).
	263	* Every character occurs in at most one rx_super_edge per super-state.
	264	* But, that structure might have more than one option, indicating a point
	265	* of non-determinism.
	266	*
	267	* In other words, this structure holds a list of superstate edges
	268	* sharing a common starting state and character label. The edges
	269	* are in the field OPTIONS. All superstate edges sharing the same
	270	* starting state and character are in this list.
	271	*/
	272	struct rx_super_edge
	273	{
	274	struct rx_super_edge *next;
	275	struct rx_inx rx_backtrack_frame;
	276	int cset_size;
	277	rx_Bitset cset;
	278	struct rx_distinct_future *options;
	279	};
	280
	281	/* A superstate is a set of nfa states (RX_SUPERSET) along
	282	* with a transition table. Superstates are built on demand and reclaimed
	283	* without warning. To protect a superstate from this ghastly fate,
	284	* use LOCK_SUPERSTATE.
	285	*/
	286	struct rx_superstate
	287	{
	288	int rx_id; /* c.f. the id field of rx_superset */
	289	int locks; /* protection from reclamation */
	290
	291	/* Within a superstate cache, all the superstates are kept in a big
	292	* queue. The tail of the queue is the state most likely to be
	293	* reclaimed. The *recyclable fields hold the queue position of
	294	* this state.
	295	*/
	296	struct rx_superstate * next_recyclable;
	297	struct rx_superstate * prev_recyclable;
	298
	299	/* The supernfa edges that exist in the cache and that have
	300	* this state as their destination are kept in this list:
	301	*/
	302	struct rx_distinct_future * transition_refs;
	303
	304	/* The list of nfa states corresponding to this superstate: */
	305	struct rx_superset * contents;
	306
	307	/* The list of edges in the cache beginning from this state. */
	308	struct rx_super_edge * edges;
	309
	310	/* A tail of the recyclable queue is marked as semifree. A semifree
	311	* state has no incoming next_char transitions -- any transition
	312	* into a semifree state causes a complex dispatch with the side
	313	* effect of rescuing the state from its semifree state into a
	314	* fully free state at the head of the queue.
	315	*
	316	* An alternative to this might be to make next_char more expensive,
	317	* and to move a state to the head of the recyclable queue whenever
	318	* it is entered. That way, popular states would never be recycled.
	319	*
	320	* But unilaterally making next_char more expensive actually loses.
	321	* So, incoming transitions are only made expensive for states near
	322	* the tail of the recyclable queue. The more cache contention
	323	* there is, the more frequently a state will have to prove itself
	324	* and be moved back to the front of the queue. If there is less
	325	* contention, then popular states just aggregate in the front of
	326	* the queue and stay there.
	327	*/
	328	int is_semifree;
	329
	330
	331	/* This keeps track of the size of the transition table for this
	332	* state. There is a half-hearted attempt to support variable sized
	333	* superstates.
	334	*/
	335	int trans_size;
	336
	337	/* Indexed by characters... */
	338	struct rx_inx transitions[RX_TAIL_ARRAY];
	339	};
	340
	341
	342	/* A list of distinct futures define the edges that leave from a
	343	* given superstate on a given character. c.f. rx_super_edge.
	344	*/
	345	struct rx_distinct_future
	346	{
	347	struct rx_distinct_future * next_same_super_edge[2];
	348	struct rx_distinct_future * next_same_dest;
	349	struct rx_distinct_future * prev_same_dest;
	350	struct rx_superstate * present; /* source state */
	351	struct rx_superstate * future; /* destination state */
	352	struct rx_super_edge * edge;
	353
	354
	355	/* The future_frame holds the instruction that should be executed
	356	* after all the side effects are done, when it is time to complete
	357	* the transition to the next state.
	358	*
	359	* Normally this is a next_char instruction, but it may be a
	360	* cache_miss instruction as well, depending on whether or not
	361	* the superstate is in the cache and semifree.
	362	*
	363	* If this is the only future for a given superstate/char, and
	364	* if there are no side effects to be performed, this frame is
	365	* not used (directly) at all. Instead, its contents are copied
	366	* into the transition table of the starting state of this dist. future
	367	* (a sort of goto elimination).
	368	*/
	369	struct rx_inx future_frame;
	370
	371	struct rx_inx side_effects_frame;
	372	struct rx_se_list * effects;
	373	};
	374
	375	#define rx_lock_superstate(R,S) ((S)->locks++)
	376	#define rx_unlock_superstate(R,S) (--(S)->locks)
	377	\f
	378	struct rx_cache;
	379
	380	#ifdef __STDC__
	381	typedef void (rx_morecore_fn)(struct rx_cache );
	382	#else
	383	typedef void (*rx_morecore_fn)();
	384	#endif
	385
	386	struct rx_cache
	387	{
	388	struct rx_hash_rules superset_hash_rules;
	389
	390	struct rx_superstate * lru_superstate;
	391	struct rx_superstate * semifree_superstate;
	392
	393	struct rx_superset * empty_superset;
	394
	395	int superstates;
	396	int semifree_superstates;
	397	int hits;
	398	int misses;
	399
	400	int bytes_allowed;
	401	int bytes_used;
	402
	403	int local_cset_size;
	404	void ** instruction_table;
	405
	406	struct rx_hash superset_table;
	407	};
	408
	409	#ifndef RX_DEFAULT_DFA_CACHE_SIZE
	410	/* This is an upper bound on the number of bytes that may (normally)
	411	* be allocated for DFA states. If this threshold would be exceeded,
	412	* Rx tries to flush some DFA states from the cache.
	413	*
	414	* This value is used whenever the rx_default_cache is used (for example,
	415	* with the Posix entry points).
	416	*/
	417	#define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
	418	#endif
	419
	420	extern struct rx_cache * rx_default_cache;
	421
	422	\f
	423	#ifdef __STDC__
	424	extern char * rx_cache_malloc (struct rx_cache * cache, int size);
	425	extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
	426	extern void rx_release_superset (struct rx *rx,
	427	struct rx_superset *set);
	428	extern struct rx_superset * rx_superset_cons (struct rx * rx,
	429	struct rx_nfa_state car, struct rx_superset cdr);
	430	extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset set, struct rx_nfa_state_set ecl) ;
	431	extern struct rx_superstate * rx_superstate (struct rx *rx,
	432	struct rx_superset *set);
	433	extern struct rx_inx * rx_handle_cache_miss (struct rx rx, struct rx_superstate super, unsigned char chr, void *data) ;
	434
	435	#else /* STDC */
	436	extern char * rx_cache_malloc ();
	437	extern void rx_cache_free ();
	438	extern void rx_release_superset ();
	439	extern struct rx_superset * rx_superset_cons ();
	440	extern struct rx_superset * rx_superstate_eclosure_union ();
	441	extern struct rx_superstate * rx_superstate ();
	442	extern struct rx_inx * rx_handle_cache_miss ();
	443
	444	#endif /* STDC */
	445
	446	#endif /* RXSUPERH */