diff options
Diffstat (limited to 'contrib/libg++/librx/rx.h')
-rw-r--r-- | contrib/libg++/librx/rx.h | 3732 |
1 files changed, 0 insertions, 3732 deletions
diff --git a/contrib/libg++/librx/rx.h b/contrib/libg++/librx/rx.h deleted file mode 100644 index 60f99e199e28..000000000000 --- a/contrib/libg++/librx/rx.h +++ /dev/null @@ -1,3732 +0,0 @@ -#if !defined(RXH) || defined(RX_WANT_SE_DEFS) -#define RXH - -/* Copyright (C) 1992, 1993 Free Software Foundation, Inc. - -This file is part of the librx library. - -Librx is free software; you can redistribute it and/or modify it under -the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -Librx is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU Library General Public -License along with this software; see the file COPYING.LIB. If not, -write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA -02139, USA. */ -/* t. lord Wed Sep 23 18:20:57 1992 */ - - - - - - - - -#ifndef RX_WANT_SE_DEFS - -/* This page: Bitsets */ - -#ifndef RX_subset -typedef unsigned int RX_subset; -#define RX_subset_bits (32) -#define RX_subset_mask (RX_subset_bits - 1) -#endif - -typedef RX_subset * rx_Bitset; - -#ifdef __STDC__ -typedef void (*rx_bitset_iterator) (void *, int member_index); -#else -typedef void (*rx_bitset_iterator) (); -#endif - -#define rx_bitset_subset(N) ((N) / RX_subset_bits) -#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) -#define RX_bitset_access(B,N,OP) \ - ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) -#define RX_bitset_member(B,N) RX_bitset_access(B, N, &) -#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) -#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) -#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) -#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) -#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) - - - -/* This page: Splay trees. */ - -#ifdef __STDC__ -typedef int (*rx_sp_comparer) (void * a, void * b); -#else -typedef int (*rx_sp_comparer) (); -#endif - -struct rx_sp_node -{ - void * key; - void * data; - struct rx_sp_node * kids[2]; -}; - -#ifdef __STDC__ -typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); -#else -typedef void (*rx_sp_key_data_freer) (); -#endif - - -/* giant inflatable hash trees */ - -struct rx_hash_item -{ - struct rx_hash_item * next_same_hash; - struct rx_hash * table; - unsigned long hash; - void * data; - void * binding; -}; - -struct rx_hash -{ - struct rx_hash * parent; - int refs; - struct rx_hash * children[13]; - struct rx_hash_item * buckets [13]; - int bucket_size [13]; -}; - -struct rx_hash_rules; - -#ifdef __STDC__ -/* should return like == */ -typedef int (*rx_hash_eq)(void *, void *); -typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); -typedef void (*rx_free_hash)(struct rx_hash *, - struct rx_hash_rules *); -typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, - void *); -typedef void (*rx_free_hash_item)(struct rx_hash_item *, - struct rx_hash_rules *); -#else -typedef int (*rx_hash_eq)(); -typedef struct rx_hash * (*rx_alloc_hash)(); -typedef void (*rx_free_hash)(); -typedef struct rx_hash_item * (*rx_alloc_hash_item)(); -typedef void (*rx_free_hash_item)(); -#endif - -struct rx_hash_rules -{ - rx_hash_eq eq; - rx_alloc_hash hash_alloc; - rx_free_hash free_hash; - rx_alloc_hash_item hash_item_alloc; - rx_free_hash_item free_hash_item; -}; - - -/* Forward declarations */ - -struct rx_cache; -struct rx_superset; -struct rx; -struct rx_se_list; - - - -/* - * GLOSSARY - * - * regexp - * regular expression - * expression - * pattern - a `regular' expression. The expression - * need not be formally regular -- it can contain - * constructs that don't correspond to purely regular - * expressions. - * - * buffer - * string - the string (or strings) being searched or matched. - * - * pattern buffer - a structure of type `struct re_pattern_buffer' - * This in turn contains a `struct rx', which holds the - * NFA compiled from a pattern, as well as some of the state - * of a matcher using the pattern. - * - * NFA - nondeterministic finite automata. Some people - * use this term to a member of the class of - * regular automata (those corresponding to a regular - * language). However, in this code, the meaning is - * more general. The automata used by Rx are comperable - * in power to what are usually called `push down automata'. - * - * Two NFA are built by rx for every pattern. One is built - * by the compiler. The other is built from the first, on - * the fly, by the matcher. The latter is called the `superstate - * NFA' because its states correspond to sets of states from - * the first NFA. (Joe Keane gets credit for the name - * `superstate NFA'). - * - * NFA edges - * epsilon edges - * side-effect edges - The NFA compiled from a pattern can have three - * kinds of edges. Epsilon edges can be taken freely anytime - * their source state is reached. Character set edges can be - * taken when their source state is reached and when the next - * character in the buffer is a member of the set. Side effect - * edges imply a transition that can only be taken after the - * indicated side effect has been successfully accomplished. - * Some examples of side effects are: - * - * Storing the current match position to record the - * location of a parentesized subexpression. - * - * Advancing the matcher over N characters if they - * match the N characters previously matched by a - * parentesized subexpression. - * - * Both of those kinds of edges occur in the NFA generated - * by the pattern: \(.\)\1 - * - * Epsilon and side effect edges are similar. Unfortunately, - * some of the code uses the name `epsilon edge' to mean - * both epsilon and side effect edges. For example, the - * function has_non_idempotent_epsilon_path computes the existance - * of a non-trivial path containing only a mix of epsilon and - * side effect edges. In that case `nonidempotent epsilon' is being - * used to mean `side effect'. - */ - - - - - -/* LOW LEVEL PATTERN BUFFERS */ - -/* Suppose that from some NFA state, more than one path through - * side-effect edges is possible. In what order should the paths - * be tried? A function of type rx_se_list_order answers that - * question. It compares two lists of side effects, and says - * which list comes first. - */ - -#ifdef __STDC__ -typedef int (*rx_se_list_order) (struct rx *, - struct rx_se_list *, - struct rx_se_list *); -#else -typedef int (*rx_se_list_order) (); -#endif - - - -/* Struct RX holds a compiled regular expression - that is, an nfa - * ready to be converted on demand to a more efficient superstate nfa. - * This is for the low level interface. The high-level interfaces enclose - * this in a `struct re_pattern_buffer'. - */ -struct rx -{ - /* The compiler assigns a unique id to every pattern. - * Like sequence numbers in X, there is a subtle bug here - * if you use Rx in a system that runs for a long time. - * But, because of the way the caches work out, it is almost - * impossible to trigger the Rx version of this bug. - * - * The id is used to validate superstates found in a cache - * of superstates. It isn't sufficient to let a superstate - * point back to the rx for which it was compiled -- the caller - * may be re-using a `struct rx' in which case the superstate - * is not really valid. So instead, superstates are validated - * by checking the sequence number of the pattern for which - * they were built. - */ - int rx_id; - - /* This is memory mgt. state for superstates. This may be - * shared by more than one struct rx. - */ - struct rx_cache * cache; - - /* Every regex defines the size of its own character set. - * A superstate has an array of this size, with each element - * a `struct rx_inx'. So, don't make this number too large. - * In particular, don't make it 2^16. - */ - int local_cset_size; - - /* After the NFA is built, it is copied into a contiguous region - * of memory (mostly for compatability with GNU regex). - * Here is that region, and it's size: - */ - void * buffer; - unsigned long allocated; - - /* Clients of RX can ask for some extra storage in the space pointed - * to by BUFFER. The field RESERVED is an input parameter to the - * compiler. After compilation, this much space will be available - * at (buffer + allocated - reserved) - */ - unsigned long reserved; - - /* --------- The remaining fields are for internal use only. --------- */ - /* --------- But! they must be initialized to 0. --------- */ - - /* NODEC is the number of nodes in the NFA with non-epsilon - * transitions. - */ - int nodec; - - /* EPSNODEC is the number of nodes with only epsilon transitions. */ - int epsnodec; - - /* The sum (NODEC + EPSNODEC) is the total number of states in the - * compiled NFA. - */ - - /* Lists of side effects as stored in the NFA are `hash consed'..meaning - * that lists with the same elements are ==. During compilation, - * this table facilitates hash-consing. - */ - struct rx_hash se_list_memo; - - /* Lists of NFA states are also hashed. - */ - struct rx_hash set_list_memo; - - - - - /* The compiler and matcher must build a number of instruction frames. - * The format of these frames is fixed (c.f. struct rx_inx). The values - * of the instructions is not fixed. - * - * An enumerated type (enum rx_opcode) defines the set of instructions - * that the compiler or matcher might generate. When filling an instruction - * frame, the INX field is found by indexing this instruction table - * with an opcode: - */ - void ** instruction_table; - - /* The list of all states in an NFA. - * During compilation, the NEXT field of NFA states links this list. - * After compilation, all the states are compacted into an array, - * ordered by state id numbers. At that time, this points to the base - * of that array. - */ - struct rx_nfa_state *nfa_states; - - /* Every nfa begins with one distinguished starting state: - */ - struct rx_nfa_state *start; - - /* This orders the search through super-nfa paths. - * See the comment near the typedef of rx_se_list_order. - */ - rx_se_list_order se_list_cmp; - - struct rx_superset * start_set; -}; - - - - -/* SYNTAX TREES */ - -/* Compilation is in stages. - * - * In the first stage, a pattern specified by a string is - * translated into a syntax tree. Later stages will convert - * the syntax tree into an NFA optimized for conversion to a - * superstate-NFA. - * - * This page is about syntax trees. - */ - -enum rexp_node_type -{ - r_cset, /* Match from a character set. `a' or `[a-z]'*/ - r_concat, /* Concat two subexpressions. `ab' */ - r_alternate, /* Choose one of two subexpressions. `a\|b' */ - r_opt, /* Optional subexpression. `a?' */ - r_star, /* Repeated subexpression. `a*' */ - - - /* A 2phase-star is a variation on a repeated subexpression. - * In this case, there are two subexpressions. The first, if matched, - * begins a repitition (otherwise, the whole expression is matches the - * empth string). - * - * After matching the first subexpression, a 2phase star either finishes, - * or matches the second subexpression. If the second subexpression is - * matched, then the whole construct repeats. - * - * 2phase stars are used in two circumstances. First, they - * are used as part of the implementation of POSIX intervals (counted - * repititions). Second, they are used to implement proper star - * semantics when the repeated subexpression contains paths of - * only side effects. See rx_compile for more information. - */ - r_2phase_star, - - - /* c.f. "typedef void * rx_side_effect" */ - r_side_effect, - - /* This is an extension type: It is for transient use in source->source - * transformations (implemented over syntax trees). - */ - r_data -}; - -/* A side effect is a matcher-specific action associated with - * transitions in the NFA. The details of side effects are up - * to the matcher. To the compiler and superstate constructors - * side effects are opaque: - */ - -typedef void * rx_side_effect; - -/* Nodes in a syntax tree are of this type: - */ -struct rexp_node -{ - enum rexp_node_type type; - union - { - rx_Bitset cset; - rx_side_effect side_effect; - struct - { - struct rexp_node *left; - struct rexp_node *right; - } pair; - void * data; - } params; -}; - - - -/* NFA - * - * A syntax tree is compiled into an NFA. This page defines the structure - * of that NFA. - */ - -struct rx_nfa_state -{ - /* These are kept in a list as the NFA is being built. */ - struct rx_nfa_state *next; - - /* After the NFA is built, states are given integer id's. - * States whose outgoing transitions are all either epsilon or - * side effect edges are given ids less than 0. Other states - * are given successive non-negative ids starting from 0. - */ - int id; - - /* The list of NFA edges that go from this state to some other. */ - struct rx_nfa_edge *edges; - - /* If you land in this state, then you implicitly land - * in all other states reachable by only epsilon translations. - * Call the set of maximal paths to such states the epsilon closure - * of this state. - * - * There may be other states that are reachable by a mixture of - * epsilon and side effect edges. Consider the set of maximal paths - * of that sort from this state. Call it the epsilon-side-effect - * closure of the state. - * - * The epsilon closure of the state is a subset of the epsilon-side- - * effect closure. It consists of all the paths that contain - * no side effects -- only epsilon edges. - * - * The paths in the epsilon-side-effect closure can be partitioned - * into equivalance sets. Two paths are equivalant if they have the - * same set of side effects, in the same order. The epsilon-closure - * is one of these equivalance sets. Let's call these equivalance - * sets: observably equivalant path sets. That name is chosen - * because equivalance of two paths means they cause the same side - * effects -- so they lead to the same subsequent observations other - * than that they may wind up in different target states. - * - * The superstate nfa, which is derived from this nfa, is based on - * the observation that all of the paths in an observably equivalant - * path set can be explored at the same time, provided that the - * matcher keeps track not of a single nfa state, but of a set of - * states. In particular, after following all the paths in an - * observably equivalant set, you wind up at a set of target states. - * That set of target states corresponds to one state in the - * superstate NFA. - * - * Staticly, before matching begins, it is convenient to analyze the - * nfa. Each state is labeled with a list of the observably - * equivalant path sets who's union covers all the - * epsilon-side-effect paths beginning in this state. This list is - * called the possible futures of the state. - * - * A trivial example is this NFA: - * s1 - * A ---> B - * - * s2 - * ---> C - * - * epsilon s1 - * ---------> D ------> E - * - * - * In this example, A has two possible futures. - * One invokes the side effect `s1' and contains two paths, - * one ending in state B, the other in state E. - * The other invokes the side effect `s2' and contains only - * one path, landing in state C. - */ - struct rx_possible_future *futures; - - - /* There are exactly two distinguished states in every NFA: */ - unsigned int is_final:1; - unsigned int is_start:1; - - /* These are used during NFA construction... */ - unsigned int eclosure_needed:1; - unsigned int mark:1; -}; - - -/* An edge in an NFA is typed: */ -enum rx_nfa_etype -{ - /* A cset edge is labled with a set of characters one of which - * must be matched for the edge to be taken. - */ - ne_cset, - - /* An epsilon edge is taken whenever its starting state is - * reached. - */ - ne_epsilon, - - /* A side effect edge is taken whenever its starting state is - * reached. Side effects may cause the match to fail or the - * position of the matcher to advance. - */ - ne_side_effect /* A special kind of epsilon. */ -}; - -struct rx_nfa_edge -{ - struct rx_nfa_edge *next; - enum rx_nfa_etype type; - struct rx_nfa_state *dest; - union - { - rx_Bitset cset; - rx_side_effect side_effect; - } params; -}; - - - -/* A possible future consists of a list of side effects - * and a set of destination states. Below are their - * representations. These structures are hash-consed which - * means that lists with the same elements share a representation - * (their addresses are ==). - */ - -struct rx_nfa_state_set -{ - struct rx_nfa_state * car; - struct rx_nfa_state_set * cdr; -}; - -struct rx_se_list -{ - rx_side_effect car; - struct rx_se_list * cdr; -}; - -struct rx_possible_future -{ - struct rx_possible_future *next; - struct rx_se_list * effects; - struct rx_nfa_state_set * destset; -}; - - - -/* This begins the description of the superstate NFA. - * - * The superstate NFA corresponds to the NFA in these ways: - * - * Every superstate NFA states SUPER correspond to sets of NFA states, - * nfa_states(SUPER). - * - * Superstate edges correspond to NFA paths. - * - * The superstate has no epsilon transitions; - * every edge has a character label, and a (possibly empty) side - * effect label. The side effect label corresponds to a list of - * side effects that occur in the NFA. These parts are referred - * to as: superedge_character(EDGE) and superedge_sides(EDGE). - * - * For a superstate edge EDGE starting in some superstate SUPER, - * the following is true (in pseudo-notation :-): - * - * exists DEST in nfa_states s.t. - * exists nfaEDGE in nfa_edges s.t. - * origin (nfaEDGE) == DEST - * && origin (nfaEDGE) is a member of nfa_states(SUPER) - * && exists PF in possible_futures(dest(nfaEDGE)) s.t. - * sides_of_possible_future (PF) == superedge_sides (EDGE) - * - * also: - * - * let SUPER2 := superedge_destination(EDGE) - * nfa_states(SUPER2) - * == union of all nfa state sets S s.t. - * exists PF in possible_futures(dest(nfaEDGE)) s.t. - * sides_of_possible_future (PF) == superedge_sides (EDGE) - * && S == dests_of_possible_future (PF) } - * - * Or in english, every superstate is a set of nfa states. A given - * character and a superstate implies many transitions in the NFA -- - * those that begin with an edge labeled with that character from a - * state in the set corresponding to the superstate. - * - * The destinations of those transitions each have a set of possible - * futures. A possible future is a list of side effects and a set of - * destination NFA states. Two sets of possible futures can be - * `merged' by combining all pairs of possible futures that have the - * same side effects. A pair is combined by creating a new future - * with the same side effect but the union of the two destination sets. - * In this way, all the possible futures suggested by a superstate - * and a character can be merged into a set of possible futures where - * no two elements of the set have the same set of side effects. - * - * The destination of a possible future, being a set of NFA states, - * corresponds to a supernfa state. So, the merged set of possible - * futures we just created can serve as a set of edges in the - * supernfa. - * - * The representation of the superstate nfa and the nfa is critical. - * The nfa has to be compact, but has to facilitate the rapid - * computation of missing superstates. The superstate nfa has to - * be fast to interpret, lazilly constructed, and bounded in space. - * - * To facilitate interpretation, the superstate data structures are - * peppered with `instruction frames'. There is an instruction set - * defined below which matchers using the supernfa must be able to - * interpret. - * - * We'd like to make it possible but not mandatory to use code - * addresses to represent instructions (c.f. gcc's computed goto). - * Therefore, we define an enumerated type of opcodes, and when - * writing one of these instructions into a data structure, use - * the opcode as an index into a table of instruction values. - * - * Here are the opcodes that occur in the superstate nfa: - */ - - -/* Every superstate contains a table of instruction frames indexed - * by characters. A normal `move' in a matcher is to fetch the next - * character and use it as an index into a superstates transition - * table. - * - * In the fasted case, only one edge follows from that character. - * In other cases there is more work to do. - * - * The descriptions of the opcodes refer to data structures that are - * described further below. - */ - -enum rx_opcode -{ - /* - * BACKTRACK_POINT is invoked when a character transition in - * a superstate leads to more than one edge. In that case, - * the edges have to be explored independently using a backtracking - * strategy. - * - * A BACKTRACK_POINT instruction is stored in a superstate's - * transition table for some character when it is known that that - * character crosses more than one edge. On encountering this - * instruction, the matcher saves enough state to backtrack to this - * point in the match later. - */ - rx_backtrack_point = 0, /* data is (struct transition_class *) */ - - /* - * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. - * There is one occurence of this instruction per rx_distinct_future. - * This instruction is skipped if a rx_distinct_future has no side effects. - */ - rx_do_side_effects = rx_backtrack_point + 1, - - /* data is (struct rx_distinct_future *) */ - - /* - * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose - * destination superstate has been reclaimed (or was never built). - * It recomputes the destination superstate. - * RX_CACHE_MISS is also stored in a superstate transition table before - * any of its edges have been built. - */ - rx_cache_miss = rx_do_side_effects + 1, - /* data is (struct rx_distinct_future *) */ - - /* - * RX_NEXT_CHAR is called to consume the next character and take the - * corresponding transition. This is the only instruction that uses - * the DATA field of the instruction frame instead of DATA_2. - * (see EXPLORE_FUTURE in regex.c). - */ - rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ - - /* RX_BACKTRACK indicates that a transition fails. - */ - rx_backtrack = rx_next_char + 1, /* no data */ - - /* - * RX_ERROR_INX is stored only in places that should never be executed. - */ - rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ - - rx_num_instructions = rx_error_inx + 1 -}; - -/* An id_instruction_table holds the values stored in instruction - * frames. The table is indexed by the enums declared above. - */ -extern void * rx_id_instruction_table[rx_num_instructions]; - -/* The heart of the matcher is a `word-code-interpreter' - * (like a byte-code interpreter, except that instructions - * are a full word wide). - * - * Instructions are not stored in a vector of code, instead, - * they are scattered throughout the data structures built - * by the regexp compiler and the matcher. One word-code instruction, - * together with the arguments to that instruction, constitute - * an instruction frame (struct rx_inx). - * - * This structure type is padded by hand to a power of 2 because - * in one of the dominant cases, we dispatch by indexing a table - * of instruction frames. If that indexing can be accomplished - * by just a shift of the index, we're happy. - * - * Instructions take at most one argument, but there are two - * slots in an instruction frame that might hold that argument. - * These are called data and data_2. The data slot is only - * used for one instruction (RX_NEXT_CHAR). For all other - * instructions, data should be set to 0. - * - * RX_NEXT_CHAR is the most important instruction by far. - * By reserving the data field for its exclusive use, - * instruction dispatch is sped up in that case. There is - * no need to fetch both the instruction and the data, - * only the data is needed. In other words, a `cycle' begins - * by fetching the field data. If that is non-0, then it must - * be the destination state of a next_char transition, so - * make that value the current state, advance the match position - * by one character, and start a new cycle. On the other hand, - * if data is 0, fetch the instruction and do a more complicated - * dispatch on that. - */ - -struct rx_inx -{ - void * data; - void * data_2; - void * inx; - void * fnord; -}; - -#ifndef RX_TAIL_ARRAY -#define RX_TAIL_ARRAY 1 -#endif - -/* A superstate corresponds to a set of nfa states. Those sets are - * represented by STRUCT RX_SUPERSET. The constructors - * guarantee that only one (shared) structure is created for a given set. - */ -struct rx_superset -{ - int refs; /* This is a reference counted structure. */ - - /* We keep these sets in a cache because (in an unpredictable way), - * the same set is often created again and again. But that is also - * problematic -- compatibility with POSIX and GNU regex requires - * that we not be able to tell when a program discards a particular - * NFA (thus invalidating the supersets created from it). - * - * But when a cache hit appears to occur, we will have in hand the - * nfa for which it may have happened. That is why every nfa is given - * its own sequence number. On a cache hit, the cache is validated - * by comparing the nfa sequence number to this field: - */ - int id; - - struct rx_nfa_state * car; /* May or may not be a valid addr. */ - struct rx_superset * cdr; - - /* If the corresponding superstate exists: */ - struct rx_superstate * superstate; - - - /* There is another bookkeeping problem. It is expensive to - * compute the starting nfa state set for an nfa. So, once computed, - * it is cached in the `struct rx'. - * - * But, the state set can be flushed from the superstate cache. - * When that happens, we can't know if the corresponding `struct rx' - * is still alive or if it has been freed or re-used by the program. - * So, the cached pointer to this set in a struct rx might be invalid - * and we need a way to validate it. - * - * Fortunately, even if this set is flushed from the cache, it is - * not freed. It just goes on the free-list of supersets. - * So we can still examine it. - * - * So to validate a starting set memo, check to see if the - * starts_for field still points back to the struct rx in question, - * and if the ID matches the rx sequence number. - */ - struct rx * starts_for; - - /* This is used to link into a hash bucket so these objects can - * be `hash-consed'. - */ - struct rx_hash_item hash_item; -}; - -#define rx_protect_superset(RX,CON) (++(CON)->refs) - -/* The terminology may be confusing (rename this structure?). - * Every character occurs in at most one rx_super_edge per super-state. - * But, that structure might have more than one option, indicating a point - * of non-determinism. - * - * In other words, this structure holds a list of superstate edges - * sharing a common starting state and character label. The edges - * are in the field OPTIONS. All superstate edges sharing the same - * starting state and character are in this list. - */ -struct rx_super_edge -{ - struct rx_super_edge *next; - struct rx_inx rx_backtrack_frame; - int cset_size; - rx_Bitset cset; - struct rx_distinct_future *options; -}; - -/* A superstate is a set of nfa states (RX_SUPERSET) along - * with a transition table. Superstates are built on demand and reclaimed - * without warning. To protect a superstate from this ghastly fate, - * use LOCK_SUPERSTATE. - */ -struct rx_superstate -{ - int rx_id; /* c.f. the id field of rx_superset */ - int locks; /* protection from reclamation */ - - /* Within a superstate cache, all the superstates are kept in a big - * queue. The tail of the queue is the state most likely to be - * reclaimed. The *recyclable fields hold the queue position of - * this state. - */ - struct rx_superstate * next_recyclable; - struct rx_superstate * prev_recyclable; - - /* The supernfa edges that exist in the cache and that have - * this state as their destination are kept in this list: - */ - struct rx_distinct_future * transition_refs; - - /* The list of nfa states corresponding to this superstate: */ - struct rx_superset * contents; - - /* The list of edges in the cache beginning from this state. */ - struct rx_super_edge * edges; - - /* A tail of the recyclable queue is marked as semifree. A semifree - * state has no incoming next_char transitions -- any transition - * into a semifree state causes a complex dispatch with the side - * effect of rescuing the state from its semifree state. - * - * An alternative to this might be to make next_char more expensive, - * and to move a state to the head of the recyclable queue whenever - * it is entered. That way, popular states would never be recycled. - * - * But unilaterally making next_char more expensive actually loses. - * So, incoming transitions are only made expensive for states near - * the tail of the recyclable queue. The more cache contention - * there is, the more frequently a state will have to prove itself - * and be moved back to the front of the queue. If there is less - * contention, then popular states just aggregate in the front of - * the queue and stay there. - */ - int is_semifree; - - - /* This keeps track of the size of the transition table for this - * state. There is a half-hearted attempt to support variable sized - * superstates. - */ - int trans_size; - - /* Indexed by characters... */ - struct rx_inx transitions[RX_TAIL_ARRAY]; -}; - - -/* A list of distinct futures define the edges that leave from a - * given superstate on a given character. c.f. rx_super_edge. - */ - -struct rx_distinct_future -{ - struct rx_distinct_future * next_same_super_edge[2]; - struct rx_distinct_future * next_same_dest; - struct rx_distinct_future * prev_same_dest; - struct rx_superstate * present; /* source state */ - struct rx_superstate * future; /* destination state */ - struct rx_super_edge * edge; - - - /* The future_frame holds the instruction that should be executed - * after all the side effects are done, when it is time to complete - * the transition to the next state. - * - * Normally this is a next_char instruction, but it may be a - * cache_miss instruction as well, depending on whether or not - * the superstate is in the cache and semifree. - * - * If this is the only future for a given superstate/char, and - * if there are no side effects to be performed, this frame is - * not used (directly) at all. Instead, its contents are copied - * into the transition table of the starting state of this dist. future. - */ - struct rx_inx future_frame; - - struct rx_inx side_effects_frame; - struct rx_se_list * effects; -}; - -#define rx_lock_superstate(R,S) ((S)->locks++) -#define rx_unlock_superstate(R,S) (--(S)->locks) - - -/* This page destined for rx.h */ - -struct rx_blocklist -{ - struct rx_blocklist * next; - int bytes; -}; - -struct rx_freelist -{ - struct rx_freelist * next; -}; - -struct rx_cache; - -#ifdef __STDC__ -typedef void (*rx_morecore_fn)(struct rx_cache *); -#else -typedef void (*rx_morecore_fn)(); -#endif - -/* You use this to control the allocation of superstate data - * during matching. Most of it should be initialized to 0. - * - * A MORECORE function is necessary. It should allocate - * a new block of memory or return 0. - * A default that uses malloc is called `rx_morecore'. - * - * The number of SUPERSTATES_ALLOWED indirectly limits how much memory - * the system will try to allocate. The default is 128. Batch style - * applications that are very regexp intensive should use as high a number - * as possible without thrashing. - * - * The LOCAL_CSET_SIZE is the number of characters in a character set. - * It is therefore the number of entries in a superstate transition table. - * Generally, it should be 256. If your character set has 16 bits, - * it is better to translate your regexps into equivalent 8 bit patterns. - */ - -struct rx_cache -{ - struct rx_hash_rules superset_hash_rules; - - /* Objects are allocated by incrementing a pointer that - * scans across rx_blocklists. - */ - struct rx_blocklist * memory; - struct rx_blocklist * memory_pos; - int bytes_left; - char * memory_addr; - rx_morecore_fn morecore; - - /* Freelists. */ - struct rx_freelist * free_superstates; - struct rx_freelist * free_transition_classes; - struct rx_freelist * free_discernable_futures; - struct rx_freelist * free_supersets; - struct rx_freelist * free_hash; - - /* Two sets of superstates -- those that are semifreed, and those - * that are being used. - */ - struct rx_superstate * lru_superstate; - struct rx_superstate * semifree_superstate; - - struct rx_superset * empty_superset; - - int superstates; - int semifree_superstates; - int hits; - int misses; - int superstates_allowed; - - int local_cset_size; - void ** instruction_table; - - struct rx_hash superset_table; -}; - - - -/* The lowest-level search function supports arbitrarily fragmented - * strings and (optionally) suspendable/resumable searches. - * - * Callers have to provide a few hooks. - */ - -#ifndef __GNUC__ -#ifdef __STDC__ -#define __const__ const -#else -#define __const__ -#endif -#endif - -/* This holds a matcher position */ -struct rx_string_position -{ - __const__ unsigned char * pos; /* The current pos. */ - __const__ unsigned char * string; /* The current string burst. */ - __const__ unsigned char * end; /* First invalid position >= POS. */ - int offset; /* Integer address of the current burst. */ - int size; /* Current string's size. */ - int search_direction; /* 1 or -1 */ - int search_end; /* First position to not try. */ -}; - - -enum rx_get_burst_return -{ - rx_get_burst_continuation, - rx_get_burst_error, - rx_get_burst_ok, - rx_get_burst_no_more -}; - - -/* A call to get burst should make POS valid. It might be invalid - * if the STRING field doesn't point to a burst that actually - * contains POS. - * - * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to - * whether or not to pad to the left. Padding to the right is always - * appropriate, but need not go past the point indicated by STOP. - * - * If a continuation is returned, then the reentering call to - * a search function will retry the get_burst. - */ - -#ifdef __STDC__ -typedef enum rx_get_burst_return - (*rx_get_burst_fn) (struct rx_string_position * pos, - void * app_closure, - int stop); - -#else -typedef enum rx_get_burst_return (*rx_get_burst_fn) (); -#endif - - -enum rx_back_check_return -{ - rx_back_check_continuation, - rx_back_check_error, - rx_back_check_pass, - rx_back_check_fail -}; - -/* Back_check should advance the position it is passed - * over rparen - lparen characters and return pass iff - * the characters starting at POS match those indexed - * by [LPAREN..RPAREN]. - * - * If a continuation is returned, then the reentering call to - * a search function will retry the back_check. - */ - -#ifdef __STDC__ -typedef enum rx_back_check_return - (*rx_back_check_fn) (struct rx_string_position * pos, - int lparen, - int rparen, - unsigned char * translate, - void * app_closure, - int stop); - -#else -typedef enum rx_back_check_return (*rx_back_check_fn) (); -#endif - - - - -/* A call to fetch_char should return the character at POS or POS + 1. - * Returning continuations here isn't supported. OFFSET is either 0 or 1 - * and indicates which characters is desired. - */ - -#ifdef __STDC__ -typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos, - int offset, - void * app_closure, - int stop); -#else -typedef int (*rx_fetch_char_fn) (); -#endif - - -enum rx_search_return -{ - rx_search_continuation = -4, - rx_search_error = -3, - rx_search_soft_fail = -2, /* failed by running out of string */ - rx_search_fail = -1 /* failed only by reaching failure states */ - /* return values >= 0 indicate the position of a successful match */ -}; - - - - - - -/* regex.h - * - * The remaining declarations replace regex.h. - */ - -/* This is an array of error messages corresponding to the error codes. - */ -extern __const__ char *re_error_msg[]; - -/* If any error codes are removed, changed, or added, update the - `re_error_msg' table in regex.c. */ -typedef enum -{ - REG_NOERROR = 0, /* Success. */ - REG_NOMATCH, /* Didn't find a match (for regexec). */ - - /* POSIX regcomp return error codes. (In the order listed in the - standard.) */ - REG_BADPAT, /* Invalid pattern. */ - REG_ECOLLATE, /* Not implemented. */ - REG_ECTYPE, /* Invalid character class name. */ - REG_EESCAPE, /* Trailing backslash. */ - REG_ESUBREG, /* Invalid back reference. */ - REG_EBRACK, /* Unmatched left bracket. */ - REG_EPAREN, /* Parenthesis imbalance. */ - REG_EBRACE, /* Unmatched \{. */ - REG_BADBR, /* Invalid contents of \{\}. */ - REG_ERANGE, /* Invalid range end. */ - REG_ESPACE, /* Ran out of memory. */ - REG_BADRPT, /* No preceding re for repetition op. */ - - /* Error codes we've added. */ - REG_EEND, /* Premature end. */ - REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ - REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ -} reg_errcode_t; - -/* The regex.c support, as a client of rx, defines a set of possible - * side effects that can be added to the edge lables of nfa edges. - * Here is the list of sidef effects in use. - */ - -enum re_side_effects -{ -#define RX_WANT_SE_DEFS 1 -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE, -#define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE, -#include "rx.h" -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#undef RX_WANT_SE_DEFS - re_floogle_flap = 65533 -}; - -/* These hold paramaters for the kinds of side effects that are possible - * in the supported pattern languages. These include things like the - * numeric bounds of {} operators and the index of paren registers for - * subexpression measurement or backreferencing. - */ -struct re_se_params -{ - enum re_side_effects se; - int op1; - int op2; -}; - -typedef unsigned reg_syntax_t; - -struct re_pattern_buffer -{ - struct rx rx; - reg_syntax_t syntax; /* See below for syntax bit definitions. */ - - unsigned int no_sub:1; /* If set, don't return register offsets. */ - unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */ - unsigned int not_eol:1; /* match at the ends of the string. */ - unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/ - unsigned int least_subs:1; /* If set, and returning registers, return - * as few values as possible. Only - * backreferenced groups and group 0 (the whole - * match) will be returned. - */ - - /* If true, this says that the matcher should keep registers on its - * backtracking stack. For many patterns, we can easily determine that - * this isn't necessary. - */ - unsigned int match_regs_on_stack:1; - unsigned int search_regs_on_stack:1; - - /* is_anchored and begbuf_only are filled in by rx_compile. */ - unsigned int is_anchored:1; /* Anchorded by ^? */ - unsigned int begbuf_only:1; /* Anchored to char position 0? */ - - - /* If REGS_UNALLOCATED, allocate space in the `regs' structure - * for `max (RE_NREGS, re_nsub + 1)' groups. - * If REGS_REALLOCATE, reallocate space if necessary. - * If REGS_FIXED, use what's there. - */ -#define REGS_UNALLOCATED 0 -#define REGS_REALLOCATE 1 -#define REGS_FIXED 2 - unsigned int regs_allocated:2; - - - /* Either a translate table to apply to all characters before - * comparing them, or zero for no translation. The translation - * is applied to a pattern when it is compiled and to a string - * when it is matched. - */ - unsigned char * translate; - - /* If this is a valid pointer, it tells rx not to store the extents of - * certain subexpressions (those corresponding to non-zero entries). - * Passing 0x1 is the same as passing an array of all ones. Passing 0x0 - * is the same as passing an array of all zeros. - * The array should contain as many entries as their are subexps in the - * regexp. - * - * For POSIX compatability, when using regcomp and regexec this field - * is zeroed and ignored. - */ - char * syntax_parens; - - /* Number of subexpressions found by the compiler. */ - size_t re_nsub; - - void * buffer; /* Malloced memory for the nfa. */ - unsigned long allocated; /* Size of that memory. */ - - /* Pointer to a fastmap, if any, otherwise zero. re_search uses - * the fastmap, if there is one, to skip over impossible - * starting points for matches. */ - char *fastmap; - - unsigned int fastmap_accurate:1; /* These three are internal. */ - unsigned int can_match_empty:1; - struct rx_nfa_state * start; /* The nfa starting state. */ - - /* This is the list of iterator bounds for {lo,hi} constructs. - * The memory pointed to is part of the rx->buffer. - */ - struct re_se_params *se_params; - - /* This is a bitset representation of the fastmap. - * This is a true fastmap that already takes the translate - * table into account. - */ - rx_Bitset fastset; -}; - -/* Type for byte offsets within the string. POSIX mandates this. */ -typedef int regoff_t; - -/* This is the structure we store register match data in. See - regex.texinfo for a full description of what registers match. */ -struct re_registers -{ - unsigned num_regs; - regoff_t *start; - regoff_t *end; -}; - -typedef struct re_pattern_buffer regex_t; - -/* POSIX specification for registers. Aside from the different names than - `re_registers', POSIX uses an array of structures, instead of a - structure of arrays. */ -typedef struct -{ - regoff_t rm_so; /* Byte offset from string's start to substring's start. */ - regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ -} regmatch_t; - - -/* The following bits are used to determine the regexp syntax we - recognize. The set/not-set meanings are chosen so that Emacs syntax - remains the value 0. The bits are given in alphabetical order, and - the definitions shifted by one from the previous bit; thus, when we - add or remove a bit, only one other definition need change. */ - -/* If this bit is not set, then \ inside a bracket expression is literal. - If set, then such a \ quotes the following character. */ -#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) - -/* If this bit is not set, then + and ? are operators, and \+ and \? are - literals. - If set, then \+ and \? are operators and + and ? are literals. */ -#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) - -/* If this bit is set, then character classes are supported. They are: - [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], - [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. - If not set, then character classes are not supported. */ -#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) - -/* If this bit is set, then ^ and $ are always anchors (outside bracket - expressions, of course). - If this bit is not set, then it depends: - ^ is an anchor if it is at the beginning of a regular - expression or after an open-group or an alternation operator; - $ is an anchor if it is at the end of a regular expression, or - before a close-group or an alternation operator. - - This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because - POSIX draft 11.2 says that * etc. in leading positions is undefined. - We already implemented a previous draft which made those constructs - invalid, though, so we haven't changed the code back. */ -#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) - -/* If this bit is set, then special characters are always special - regardless of where they are in the pattern. - If this bit is not set, then special characters are special only in - some contexts; otherwise they are ordinary. Specifically, - * + ? and intervals are only special when not after the beginning, - open-group, or alternation operator. */ -#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) - -/* If this bit is set, then *, +, ?, and { cannot be first in an re or - immediately after an alternation or begin-group operator. */ -#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) - -/* If this bit is set, then . matches newline. - If not set, then it doesn't. */ -#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) - -/* If this bit is set, then . doesn't match NUL. - If not set, then it does. */ -#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) - -/* If this bit is set, nonmatching lists [^...] do not match newline. - If not set, they do. */ -#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) - -/* If this bit is set, either \{...\} or {...} defines an - interval, depending on RE_NO_BK_BRACES. - If not set, \{, \}, {, and } are literals. */ -#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) - -/* If this bit is set, +, ? and | aren't recognized as operators. - If not set, they are. */ -#define RE_LIMITED_OPS (RE_INTERVALS << 1) - -/* If this bit is set, newline is an alternation operator. - If not set, newline is literal. */ -#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) - -/* If this bit is set, then `{...}' defines an interval, and \{ and \} - are literals. - If not set, then `\{...\}' defines an interval. */ -#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) - -/* If this bit is set, (...) defines a group, and \( and \) are literals. - If not set, \(...\) defines a group, and ( and ) are literals. */ -#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) - -/* If this bit is set, then \<digit> matches <digit>. - If not set, then \<digit> is a back-reference. */ -#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) - -/* If this bit is set, then | is an alternation operator, and \| is literal. - If not set, then \| is an alternation operator, and | is literal. */ -#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) - -/* If this bit is set, then an ending range point collating higher - than the starting range point, as in [z-a], is invalid. - If not set, then when ending range point collates higher than the - starting range point, the range is ignored. */ -#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) - -/* If this bit is set, then an unmatched ) is ordinary. - If not set, then an unmatched ) is invalid. */ -#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) - -/* This global variable defines the particular regexp syntax to use (for - some interfaces). When a regexp is compiled, the syntax used is - stored in the pattern buffer, so changing this does not affect - already-compiled regexps. */ -extern reg_syntax_t re_syntax_options; - -/* Define combinations of the above bits for the standard possibilities. - (The [[[ comments delimit what gets put into the Texinfo file, so - don't delete them!) */ -/* [[[begin syntaxes]]] */ -#define RE_SYNTAX_EMACS 0 - -#define RE_SYNTAX_AWK \ - (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ - | RE_NO_BK_PARENS | RE_NO_BK_REFS \ - | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ - | RE_UNMATCHED_RIGHT_PAREN_ORD) - -#define RE_SYNTAX_POSIX_AWK \ - (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) - -#define RE_SYNTAX_GREP \ - (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ - | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ - | RE_NEWLINE_ALT) - -#define RE_SYNTAX_EGREP \ - (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ - | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ - | RE_NO_BK_VBAR) - -#define RE_SYNTAX_POSIX_EGREP \ - (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) - -#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC - -/* Syntax bits common to both basic and extended POSIX regex syntax. */ -#define _RE_SYNTAX_POSIX_COMMON \ - (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ - | RE_INTERVALS | RE_NO_EMPTY_RANGES) - -#define RE_SYNTAX_POSIX_BASIC \ - (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) - -/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes - RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this - isn't minimal, since other operators, such as \`, aren't disabled. */ -#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ - (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) - -#define RE_SYNTAX_POSIX_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ - | RE_UNMATCHED_RIGHT_PAREN_ORD) - -/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS - replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ -#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_REFS \ - | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) -/* [[[end syntaxes]]] */ - -/* Maximum number of duplicates an interval can allow. Some systems - (erroneously) define this in other header files, but we want our - value, so remove any previous define. */ -#ifdef RE_DUP_MAX -#undef RE_DUP_MAX -#endif -#define RE_DUP_MAX ((1 << 15) - 1) - - - -/* POSIX `cflags' bits (i.e., information for `regcomp'). */ - -/* If this bit is set, then use extended regular expression syntax. - If not set, then use basic regular expression syntax. */ -#define REG_EXTENDED 1 - -/* If this bit is set, then ignore case when matching. - If not set, then case is significant. */ -#define REG_ICASE (REG_EXTENDED << 1) - -/* If this bit is set, then anchors do not match at newline - characters in the string. - If not set, then anchors do match at newlines. */ -#define REG_NEWLINE (REG_ICASE << 1) - -/* If this bit is set, then report only success or fail in regexec. - If not set, then returns differ between not matching and errors. */ -#define REG_NOSUB (REG_NEWLINE << 1) - - -/* POSIX `eflags' bits (i.e., information for regexec). */ - -/* If this bit is set, then the beginning-of-line operator doesn't match - the beginning of the string (presumably because it's not the - beginning of a line). - If not set, then the beginning-of-line operator does match the - beginning of the string. */ -#define REG_NOTBOL 1 - -/* Like REG_NOTBOL, except for the end-of-line. */ -#define REG_NOTEOL (1 << 1) - -/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, - * `re_match_2' returns information about at least this many registers - * the first time a `regs' structure is passed. - * - * Also, this is the greatest number of backreferenced subexpressions - * allowed in a pattern being matched without caller-supplied registers. - */ -#ifndef RE_NREGS -#define RE_NREGS 30 -#endif - -extern int rx_cache_bound; -extern char rx_version_string[]; - - - -#ifdef RX_WANT_RX_DEFS - -/* This is decls to the interesting subsystems and lower layers - * of rx. Everything which doesn't have a public counterpart in - * regex.c is declared here. - */ - - -#ifdef __STDC__ -typedef void (*rx_hash_freefn) (struct rx_hash_item * it); -#else /* ndef __STDC__ */ -typedef void (*rx_hash_freefn) (); -#endif /* ndef __STDC__ */ - - - - -#ifdef __STDC__ -RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b); -RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b); -RX_DECL int rx_bitset_empty (int size, rx_Bitset set); -RX_DECL void rx_bitset_null (int size, rx_Bitset b); -RX_DECL void rx_bitset_universe (int size, rx_Bitset b); -RX_DECL void rx_bitset_complement (int size, rx_Bitset b); -RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_intersection (int size, - rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_revdifference (int size, - rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b); -RX_DECL unsigned long rx_bitset_hash (int size, rx_Bitset b); -RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table, - unsigned long hash, - void * value, - struct rx_hash_rules * rules); -RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table, - unsigned long hash, - void * value, - struct rx_hash_rules * rules); -RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules); -RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn, - struct rx_hash_rules * rules); -RX_DECL rx_Bitset rx_cset (struct rx *rx); -RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a); -RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c); -RX_DECL struct rexp_node * rexp_node (struct rx *rx, - enum rexp_node_type type); -RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx, - rx_Bitset b); -RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx, - struct rexp_node * a, - struct rexp_node * b); -RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx, - struct rexp_node * a, - struct rexp_node * b); -RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx, - struct rexp_node * a); -RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx, - struct rexp_node * a); -RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx, - struct rexp_node * a, - struct rexp_node * b); -RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx, - rx_side_effect a); -RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx, - void * a); -RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node); -RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx, - struct rexp_node *node); -RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx); -RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n); -RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx, - int id); -RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx, - enum rx_nfa_etype type, - struct rx_nfa_state *start, - struct rx_nfa_state *dest); -RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e); -RX_DECL void rx_free_nfa (struct rx *rx); -RX_DECL int rx_build_nfa (struct rx *rx, - struct rexp_node *rexp, - struct rx_nfa_state **start, - struct rx_nfa_state **end); -RX_DECL void rx_name_nfa_states (struct rx *rx); -RX_DECL int rx_eclose_nfa (struct rx *rx); -RX_DECL void rx_delete_epsilon_transitions (struct rx *rx); -RX_DECL int rx_compactify_nfa (struct rx *rx, - void **mem, unsigned long *size); -RX_DECL void rx_release_superset (struct rx *rx, - struct rx_superset *set); -RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx, - struct rx_nfa_state *car, struct rx_superset *cdr); -RX_DECL struct rx_superset * rx_superstate_eclosure_union - (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl); -RX_DECL struct rx_superstate * rx_superstate (struct rx *rx, - struct rx_superset *set); -RX_DECL struct rx_inx * rx_handle_cache_miss - (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data); -RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size, - reg_syntax_t syntax, - struct re_pattern_buffer * rxb); -RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb); -#else /* STDC */ -RX_DECL int rx_bitset_is_equal (); -RX_DECL int rx_bitset_is_subset (); -RX_DECL int rx_bitset_empty (); -RX_DECL void rx_bitset_null (); -RX_DECL void rx_bitset_universe (); -RX_DECL void rx_bitset_complement (); -RX_DECL void rx_bitset_assign (); -RX_DECL void rx_bitset_union (); -RX_DECL void rx_bitset_intersection (); -RX_DECL void rx_bitset_difference (); -RX_DECL void rx_bitset_revdifference (); -RX_DECL void rx_bitset_xor (); -RX_DECL unsigned long rx_bitset_hash (); -RX_DECL struct rx_hash_item * rx_hash_find (); -RX_DECL struct rx_hash_item * rx_hash_store (); -RX_DECL void rx_hash_free (); -RX_DECL void rx_free_hash_table (); -RX_DECL rx_Bitset rx_cset (); -RX_DECL rx_Bitset rx_copy_cset (); -RX_DECL void rx_free_cset (); -RX_DECL struct rexp_node * rexp_node (); -RX_DECL struct rexp_node * rx_mk_r_cset (); -RX_DECL struct rexp_node * rx_mk_r_concat (); -RX_DECL struct rexp_node * rx_mk_r_alternate (); -RX_DECL struct rexp_node * rx_mk_r_opt (); -RX_DECL struct rexp_node * rx_mk_r_star (); -RX_DECL struct rexp_node * rx_mk_r_2phase_star (); -RX_DECL struct rexp_node * rx_mk_r_side_effect (); -RX_DECL struct rexp_node * rx_mk_r_data (); -RX_DECL void rx_free_rexp (); -RX_DECL struct rexp_node * rx_copy_rexp (); -RX_DECL struct rx_nfa_state * rx_nfa_state (); -RX_DECL void rx_free_nfa_state (); -RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (); -RX_DECL struct rx_nfa_edge * rx_nfa_edge (); -RX_DECL void rx_free_nfa_edge (); -RX_DECL void rx_free_nfa (); -RX_DECL int rx_build_nfa (); -RX_DECL void rx_name_nfa_states (); -RX_DECL int rx_eclose_nfa (); -RX_DECL void rx_delete_epsilon_transitions (); -RX_DECL int rx_compactify_nfa (); -RX_DECL void rx_release_superset (); -RX_DECL struct rx_superset * rx_superset_cons (); -RX_DECL struct rx_superset * rx_superstate_eclosure_union (); -RX_DECL struct rx_superstate * rx_superstate (); -RX_DECL struct rx_inx * rx_handle_cache_miss (); -RX_DECL reg_errcode_t rx_compile (); -RX_DECL void rx_blow_up_fastmap (); -#endif /* STDC */ - - -#endif /* RX_WANT_RX_DEFS */ - - - -#ifdef __STDC__ -extern int re_search_2 (struct re_pattern_buffer *rxb, - __const__ char * string1, int size1, - __const__ char * string2, int size2, - int startpos, int range, - struct re_registers *regs, - int stop); -extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string, - int size, int startpos, int range, - struct re_registers *regs); -extern int re_match_2 (struct re_pattern_buffer * rxb, - __const__ char * string1, int size1, - __const__ char * string2, int size2, - int pos, struct re_registers *regs, int stop); -extern int re_match (struct re_pattern_buffer * rxb, - __const__ char * string, - int size, int pos, - struct re_registers *regs); -extern reg_syntax_t re_set_syntax (reg_syntax_t syntax); -extern void re_set_registers (struct re_pattern_buffer *bufp, - struct re_registers *regs, - unsigned num_regs, - regoff_t * starts, regoff_t * ends); -extern __const__ char * re_compile_pattern (__const__ char *pattern, - int length, - struct re_pattern_buffer * rxb); -extern int re_compile_fastmap (struct re_pattern_buffer * rxb); -extern char * re_comp (__const__ char *s); -extern int re_exec (__const__ char *s); -extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags); -extern int regexec (__const__ regex_t *preg, __const__ char *string, - size_t nmatch, regmatch_t pmatch[], - int eflags); -extern size_t regerror (int errcode, __const__ regex_t *preg, - char *errbuf, size_t errbuf_size); -extern void regfree (regex_t *preg); - -#else /* STDC */ -extern int re_search_2 (); -extern int re_search (); -extern int re_match_2 (); -extern int re_match (); -extern reg_syntax_t re_set_syntax (); -extern void re_set_registers (); -extern __const__ char * re_compile_pattern (); -extern int re_compile_fastmap (); -extern char * re_comp (); -extern int re_exec (); -extern int regcomp (); -extern int regexec (); -extern size_t regerror (); -extern void regfree (); - -#endif /* STDC */ - - - -#ifdef RX_WANT_RX_DEFS - -struct rx_counter_frame -{ - int tag; - int val; - struct rx_counter_frame * inherited_from; /* If this is a copy. */ - struct rx_counter_frame * cdr; -}; - -struct rx_backtrack_frame -{ - char * counter_stack_sp; - - /* A frame is used to save the matchers state when it crosses a - * backtracking point. The `stk_' fields correspond to variables - * in re_search_2 (just strip off thes `stk_'). They are documented - * tere. - */ - struct rx_superstate * stk_super; - unsigned int stk_c; - struct rx_string_position stk_test_pos; - int stk_last_l; - int stk_last_r; - int stk_test_ret; - - /* This is the list of options left to explore at the backtrack - * point for which this frame was created. - */ - struct rx_distinct_future * df; - struct rx_distinct_future * first_df; - -#ifdef RX_DEBUG - int stk_line_no; -#endif -}; - -struct rx_stack_chunk -{ - struct rx_stack_chunk * next_chunk; - int bytes_left; - char * sp; -}; - -enum rx_outer_entry -{ - rx_outer_start, - rx_outer_fastmap, - rx_outer_test, - rx_outer_restore_pos -}; - -enum rx_fastmap_return -{ - rx_fastmap_continuation, - rx_fastmap_error, - rx_fastmap_ok, - rx_fastmap_fail -}; - -enum rx_fastmap_entry -{ - rx_fastmap_start, - rx_fastmap_string_break -}; - -enum rx_test_return -{ - rx_test_continuation, - rx_test_error, - rx_test_fail, - rx_test_ok -}; - -enum rx_test_internal_return -{ - rx_test_internal_error, - rx_test_found_first, - rx_test_line_finished -}; - -enum rx_test_match_entry -{ - rx_test_start, - rx_test_cache_hit_loop, - rx_test_backreference_check, - rx_test_backtrack_return -}; - -struct rx_search_state -{ - /* Two groups of registers are kept. The group with the register state - * of the current test match, and the group that holds the state at the end - * of the best known match, if any. - * - * For some patterns, there may also be registers saved on the stack. - */ - unsigned num_regs; /* Includes an element for register zero. */ - regoff_t * lparen; /* scratch space for register returns */ - regoff_t * rparen; - regoff_t * best_lpspace; /* in case the user doesn't want these */ - regoff_t * best_rpspace; /* values, we still need space to store - * them. Normally, this memoryis unused - * and the space pointed to by REGS is - * used instead. - */ - - int last_l; /* Highest index of a valid lparen. */ - int last_r; /* It's dual. */ - - int * best_lparen; /* This contains the best known register */ - int * best_rparen; /* assignments. - * This may point to the same mem as - * best_lpspace, or it might point to memory - * passed by the caller. - */ - int best_last_l; /* best_last_l:best_lparen::last_l:lparen */ - int best_last_r; - - - unsigned char * translate; - - struct rx_string_position outer_pos; - - struct rx_superstate * start_super; - int nfa_choice; - int first_found; /* If true, return after finding any match. */ - int ret_val; - - /* For continuations... */ - enum rx_outer_entry outer_search_resume_pt; - struct re_pattern_buffer * saved_rxb; - int saved_startpos; - int saved_range; - int saved_stop; - int saved_total_size; - rx_get_burst_fn saved_get_burst; - rx_back_check_fn saved_back_check; - struct re_registers * saved_regs; - - /** - ** state for fastmap - **/ - char * fastmap; - int fastmap_chr; - int fastmap_val; - - /* for continuations in the fastmap procedure: */ - enum rx_fastmap_entry fastmap_resume_pt; - - /** - ** state for test_match - **/ - - /* The current superNFA position of the matcher. */ - struct rx_superstate * super; - - /* The matcher interprets a series of instruction frames. - * This is the `instruction counter' for the interpretation. - */ - struct rx_inx * ifr; - - /* We insert a ghost character in the string to prime - * the nfa. test_pos.pos, test_pos.str_half, and test_pos.end_half - * keep track of the test-match position and string-half. - */ - unsigned char c; - - /* Position within the string. */ - struct rx_string_position test_pos; - - struct rx_stack_chunk * counter_stack; - struct rx_stack_chunk * backtrack_stack; - int backtrack_frame_bytes; - int chunk_bytes; - struct rx_stack_chunk * free_chunks; - - /* To return from this function, set test_ret and - * `goto test_do_return'. - * - * Possible return values are: - * 1 --- end of string while the superNFA is still going - * 0 --- internal error (out of memory) - * -1 --- search completed by reaching the superNFA fail state - * -2 --- a match was found, maybe not the longest. - * - * When the search is complete (-1), best_last_r indicates whether - * a match was found. - * - * -2 is return only if search_state.first_found is non-zero. - * - * if search_state.first_found is non-zero, a return of -1 indicates no match, - * otherwise, best_last_r has to be checked. - */ - int test_ret; - - int could_have_continued; - -#ifdef RX_DEBUG - int backtrack_depth; - /* There is a search tree with every node as set of deterministic - * transitions in the super nfa. For every branch of a - * backtrack point is an edge in the tree. - * This counts up a pre-order of nodes in that tree. - * It's saved on the search stack and printed when debugging. - */ - int line_no; - int lines_found; -#endif - - - /* For continuations within the match tester */ - enum rx_test_match_entry test_match_resume_pt; - struct rx_inx * saved_next_tr_table; - struct rx_inx * saved_this_tr_table; - int saved_reg; - struct rx_backtrack_frame * saved_bf; - -}; - - -extern char rx_slowmap[]; -extern unsigned char rx_id_translation[]; - -static __inline__ void -init_fastmap (rxb, search_state) - struct re_pattern_buffer * rxb; - struct rx_search_state * search_state; -{ - search_state->fastmap = (rxb->fastmap - ? (char *)rxb->fastmap - : (char *)rx_slowmap); - /* Update the fastmap now if not correct already. - * When the regexp was compiled, the fastmap was computed - * and stored in a bitset. This expands the bitset into a - * character array containing 1s and 0s. - */ - if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate) - rx_blow_up_fastmap (rxb); - search_state->fastmap_chr = -1; - search_state->fastmap_val = 0; - search_state->fastmap_resume_pt = rx_fastmap_start; -} - -static __inline__ void -uninit_fastmap (rxb, search_state) - struct re_pattern_buffer * rxb; - struct rx_search_state * search_state; -{ - /* Unset the fastmap sentinel */ - if (search_state->fastmap_chr >= 0) - search_state->fastmap[search_state->fastmap_chr] - = search_state->fastmap_val; -} - -static __inline__ int -fastmap_search (rxb, stop, get_burst, app_closure, search_state) - struct re_pattern_buffer * rxb; - int stop; - rx_get_burst_fn get_burst; - void * app_closure; - struct rx_search_state * search_state; -{ - enum rx_fastmap_entry pc; - - if (0) - { - return_continuation: - search_state->fastmap_resume_pt = pc; - return rx_fastmap_continuation; - } - - pc = search_state->fastmap_resume_pt; - - switch (pc) - { - default: - return rx_fastmap_error; - case rx_fastmap_start: - init_fastmap_sentinal: - /* For the sake of fast fastmapping, set a sentinal in the fastmap. - * This sentinal will trap the fastmap loop when it reaches the last - * valid character in a string half. - * - * This must be reset when the fastmap/search loop crosses a string - * boundry, and before returning to the caller. So sometimes, - * the fastmap loop is restarted with `continue', othertimes by - * `goto init_fastmap_sentinal'. - */ - if (search_state->outer_pos.size) - { - search_state->fastmap_chr = ((search_state->outer_pos.search_direction == 1) - ? *(search_state->outer_pos.end - 1) - : *search_state->outer_pos.string); - search_state->fastmap_val - = search_state->fastmap[search_state->fastmap_chr]; - search_state->fastmap[search_state->fastmap_chr] = 1; - } - else - { - search_state->fastmap_chr = -1; - search_state->fastmap_val = 0; - } - - if (search_state->outer_pos.pos >= search_state->outer_pos.end) - goto fastmap_hit_bound; - else - { - if (search_state->outer_pos.search_direction == 1) - { - if (search_state->fastmap_val) - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - ++search_state->outer_pos.pos; - return rx_fastmap_ok; - } - } - else - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - ++search_state->outer_pos.pos; - if (*search_state->outer_pos.pos != search_state->fastmap_chr) - return rx_fastmap_ok; - else - { - ++search_state->outer_pos.pos; - if (search_state->outer_pos.pos == search_state->outer_pos.end) - goto fastmap_hit_bound; - } - } - } - } - else - { - __const__ unsigned char * bound; - bound = search_state->outer_pos.string - 1; - if (search_state->fastmap_val) - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - --search_state->outer_pos.pos; - return rx_fastmap_ok; - } - } - else - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - --search_state->outer_pos.pos; - if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val) - return rx_fastmap_ok; - else - { - --search_state->outer_pos.pos; - if (search_state->outer_pos.pos == bound) - goto fastmap_hit_bound; - } - } - } - } - } - - case rx_fastmap_string_break: - fastmap_hit_bound: - { - /* If we hit a bound, it may be time to fetch another burst - * of string, or it may be time to return a continuation to - * the caller, or it might be time to fail. - */ - - int burst_state; - burst_state = get_burst (&search_state->outer_pos, app_closure, stop); - switch (burst_state) - { - default: - case rx_get_burst_error: - return rx_fastmap_error; - case rx_get_burst_continuation: - { - pc = rx_fastmap_string_break; - goto return_continuation; - } - case rx_get_burst_ok: - goto init_fastmap_sentinal; - case rx_get_burst_no_more: - /* ...not a string split, simply no more string. - * - * When searching backward, running out of string - * is reason to quit. - * - * When searching forward, we allow the possibility - * of an (empty) match after the last character in the - * virtual string. So, fall through to the matcher - */ - return ( (search_state->outer_pos.search_direction == 1) - ? rx_fastmap_ok - : rx_fastmap_fail); - } - } - } - -} - - - -#ifdef emacs -/* The `emacs' switch turns on certain matching commands - * that make sense only in Emacs. - */ -#include "config.h" -#include "lisp.h" -#include "buffer.h" -#include "syntax.h" -#endif /* emacs */ - -/* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar - * packages too. - */ -#ifdef RX_MEMDBUG -#include <malloc.h> -#endif /* RX_RX_MEMDBUG */ - -/* We used to test for `BSTRING' here, but only GCC and Emacs define - * `BSTRING', as far as I know, and neither of them use this code. - */ -#if HAVE_STRING_H || STDC_HEADERS -#include <string.h> - -#ifndef bcmp -#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) -#endif - -#ifndef bcopy -#define bcopy(s, d, n) memcpy ((d), (s), (n)) -#endif - -#ifndef bzero -#define bzero(s, n) memset ((s), 0, (n)) -#endif - -#else /* HAVE_STRING_H || STDC_HEADERS */ -#include <strings.h> -#endif /* not (HAVE_STRING_H || STDC_HEADERS) */ - -#ifdef STDC_HEADERS -#include <stdlib.h> -#else /* not STDC_HEADERS */ -char *malloc (); -char *realloc (); -#endif /* not STDC_HEADERS */ - - - - -/* How many characters in the character set. */ -#define CHAR_SET_SIZE (1 << CHARBITS) - -#ifndef emacs -/* Define the syntax basics for \<, \>, etc. - * This must be nonzero for the wordchar and notwordchar pattern - * commands in re_match_2. - */ -#ifndef Sword -#define Sword 1 -#endif -#define SYNTAX(c) re_syntax_table[c] -RX_DECL char re_syntax_table[CHAR_SET_SIZE]; -#endif /* not emacs */ - - -/* Test if at very beginning or at very end of the virtual concatenation - * of `string1' and `string2'. If only one string, it's `string2'. - */ - -#define AT_STRINGS_BEG() \ - ( -1 \ - == ((search_state.test_pos.pos - search_state.test_pos.string) \ - + search_state.test_pos.offset)) - -#define AT_STRINGS_END() \ - ( (total_size - 1) \ - == ((search_state.test_pos.pos - search_state.test_pos.string) \ - + search_state.test_pos.offset)) - - -/* Test if POS + 1 points to a character which is word-constituent. We have - * two special cases to check for: if past the end of string1, look at - * the first character in string2; and if before the beginning of - * string2, look at the last character in string1. - * - * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG (). - */ -#define LETTER_P(POS,OFF) \ - ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \ - == Sword) - -/* Test if the character at D and the one after D differ with respect - * to being word-constituent. - */ -#define AT_WORD_BOUNDARY(d) \ - (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1)) - - -#ifdef RX_SUPPORT_CONTINUATIONS -#define RX_STACK_ALLOC(BYTES) malloc(BYTES) -#define RX_STACK_FREE(MEM) free(MEM) -#else -#define RX_STACK_ALLOC(BYTES) alloca(BYTES) -#define RX_STACK_FREE(MEM) \ - ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \ - search_state.free_chunks = ((struct rx_stack_chunk *)MEM); - -#endif - -#define PUSH(CHUNK_VAR,BYTES) \ - if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \ - { \ - struct rx_stack_chunk * new_chunk; \ - if (search_state.free_chunks) \ - { \ - new_chunk = search_state.free_chunks; \ - search_state.free_chunks = search_state.free_chunks->next_chunk; \ - } \ - else \ - { \ - new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \ - if (!new_chunk) \ - { \ - search_state.ret_val = 0; \ - goto test_do_return; \ - } \ - } \ - new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \ - new_chunk->bytes_left = (search_state.chunk_bytes \ - - (BYTES) \ - - sizeof (struct rx_stack_chunk)); \ - new_chunk->next_chunk = CHUNK_VAR; \ - CHUNK_VAR = new_chunk; \ - } \ - else \ - (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES)) - -#define POP(CHUNK_VAR,BYTES) \ - if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \ - { \ - struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \ - RX_STACK_FREE(CHUNK_VAR); \ - CHUNK_VAR = new_chunk; \ - } \ - else \ - (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES) - - - -#define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)] - - - - -#ifdef __STDC__ -RX_DECL __inline__ int -rx_search (struct re_pattern_buffer * rxb, - int startpos, - int range, - int stop, - int total_size, - rx_get_burst_fn get_burst, - rx_back_check_fn back_check, - rx_fetch_char_fn fetch_char, - void * app_closure, - struct re_registers * regs, - struct rx_search_state * resume_state, - struct rx_search_state * save_state) -#else -RX_DECL __inline__ int -rx_search (rxb, startpos, range, stop, total_size, - get_burst, back_check, fetch_char, - app_closure, regs, resume_state, save_state) - struct re_pattern_buffer * rxb; - int startpos; - int range; - int stop; - int total_size; - rx_get_burst_fn get_burst; - rx_back_check_fn back_check; - rx_fetch_char_fn fetch_char; - void * app_closure; - struct re_registers * regs; - struct rx_search_state * resume_state; - struct rx_search_state * save_state; -#endif -{ - int pc; - int test_state; - struct rx_search_state search_state; - - search_state.free_chunks = 0; - if (!resume_state) - pc = rx_outer_start; - else - { - search_state = *resume_state; - regs = search_state.saved_regs; - rxb = search_state.saved_rxb; - startpos = search_state.saved_startpos; - range = search_state.saved_range; - stop = search_state.saved_stop; - total_size = search_state.saved_total_size; - get_burst = search_state.saved_get_burst; - back_check = search_state.saved_back_check; - pc = search_state.outer_search_resume_pt; - if (0) - { - return_continuation: - if (save_state) - { - *save_state = search_state; - save_state->saved_regs = regs; - save_state->saved_rxb = rxb; - save_state->saved_startpos = startpos; - save_state->saved_range = range; - save_state->saved_stop = stop; - save_state->saved_total_size = total_size; - save_state->saved_get_burst = get_burst; - save_state->saved_back_check = back_check; - save_state->outer_search_resume_pt = pc; - } - return rx_search_continuation; - } - } - - switch (pc) - { - case rx_outer_start: - search_state.ret_val = rx_search_fail; - ( search_state.lparen - = search_state.rparen - = search_state.best_lpspace - = search_state.best_rpspace - = 0); - - /* figure the number of registers we may need for use in backreferences. - * the number here includes an element for register zero. - */ - search_state.num_regs = rxb->re_nsub + 1; - - - /* check for out-of-range startpos. */ - if ((startpos < 0) || (startpos > total_size)) - return rx_search_fail; - - /* fix up range if it might eventually take us outside the string. */ - { - int endpos; - endpos = startpos + range; - if (endpos < -1) - range = (-1 - startpos); - else if (endpos > (total_size + 1)) - range = total_size - startpos; - } - - /* if the search isn't to be a backwards one, don't waste time in a - * long search for a pattern that says it is anchored. - */ - if (rxb->begbuf_only && (range > 0)) - { - if (startpos > 0) - return rx_search_fail; - else - range = 1; - } - - /* decide whether to use internal or user-provided reg buffers. */ - if (!regs || rxb->no_sub) - { - search_state.best_lpspace = - (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - search_state.best_rpspace = - (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - search_state.best_lparen = search_state.best_lpspace; - search_state.best_rparen = search_state.best_rpspace; - } - else - { - /* have the register data arrays been allocated? */ - if (rxb->regs_allocated == REGS_UNALLOCATED) - { /* no. so allocate them with malloc. we need one - extra element beyond `search_state.num_regs' for the `-1' marker - gnu code uses. */ - regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1); - regs->start = ((regoff_t *) - malloc (regs->num_regs * sizeof ( regoff_t))); - regs->end = ((regoff_t *) - malloc (regs->num_regs * sizeof ( regoff_t))); - if (regs->start == 0 || regs->end == 0) - return rx_search_error; - rxb->regs_allocated = REGS_REALLOCATE; - } - else if (rxb->regs_allocated == REGS_REALLOCATE) - { /* yes. if we need more elements than were already - allocated, reallocate them. if we need fewer, just - leave it alone. */ - if (regs->num_regs < search_state.num_regs + 1) - { - regs->num_regs = search_state.num_regs + 1; - regs->start = ((regoff_t *) - realloc (regs->start, - regs->num_regs * sizeof (regoff_t))); - regs->end = ((regoff_t *) - realloc (regs->end, - regs->num_regs * sizeof ( regoff_t))); - if (regs->start == 0 || regs->end == 0) - return rx_search_error; - } - } - else if (rxb->regs_allocated != REGS_FIXED) - return rx_search_error; - - if (regs->num_regs < search_state.num_regs + 1) - { - search_state.best_lpspace = - ((regoff_t *) - REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t))); - search_state.best_rpspace = - ((regoff_t *) - REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t))); - search_state.best_lparen = search_state.best_lpspace; - search_state.best_rparen = search_state.best_rpspace; - } - else - { - search_state.best_lparen = regs->start; - search_state.best_rparen = regs->end; - } - } - - search_state.lparen = - (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - search_state.rparen = - (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - - if (! ( search_state.best_rparen - && search_state.best_lparen - && search_state.lparen && search_state.rparen)) - return rx_search_error; - - search_state.best_last_l = search_state.best_last_r = -1; - - search_state.translate = (rxb->translate - ? rxb->translate - : rx_id_translation); - - - - /* - * two nfa's were compiled. - * `0' is complete. - * `1' faster but gets registers wrong and ends too soon. - */ - search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1'; - - /* we have the option to look for the best match or the first - * one we can find. if the user isn't asking for register information, - * we don't need to find the best match. - */ - search_state.first_found = !regs; - - if (range >= 0) - { - search_state.outer_pos.search_end = startpos + range; - search_state.outer_pos.search_direction = 1; - } - else - { - search_state.outer_pos.search_end = startpos + range; - search_state.outer_pos.search_direction = -1; - } - - /* the vacuous search always turns up nothing. */ - if ((search_state.outer_pos.search_direction == 1) - ? (startpos > search_state.outer_pos.search_end) - : (startpos < search_state.outer_pos.search_end)) - return rx_search_fail; - - /* now we build the starting state of the supernfa. */ - { - struct rx_superset * start_contents; - struct rx_nfa_state_set * start_nfa_set; - - /* we presume here that the nfa start state has only one - * possible future with no side effects. - */ - start_nfa_set = rxb->start->futures->destset; - if ( rxb->rx.start_set - && (rxb->rx.start_set->starts_for == &rxb->rx)) - start_contents = rxb->rx.start_set; - else - { - start_contents = - rx_superstate_eclosure_union (&rxb->rx, - rx_superset_cons (&rxb->rx, 0, 0), - start_nfa_set); - - if (!start_contents) - return rx_search_fail; - - start_contents->starts_for = &rxb->rx; - rxb->rx.start_set = start_contents; - } - if ( start_contents->superstate - && (start_contents->superstate->rx_id == rxb->rx.rx_id)) - { - search_state.start_super = start_contents->superstate; - rx_lock_superstate (&rxb->rx, search_state.start_super); - } - else - { - rx_protect_superset (&rxb->rx, start_contents); - - search_state.start_super = rx_superstate (&rxb->rx, start_contents); - if (!search_state.start_super) - return rx_search_fail; - rx_lock_superstate (&rxb->rx, search_state.start_super); - rx_release_superset (&rxb->rx, start_contents); - } - } - - - /* The outer_pos tracks the position within the strings - * as seen by loop that calls fastmap_search. - * - * The caller supplied get_burst function actually - * gives us pointers to chars. - * - * Communication with the get_burst function is through an - * rx_string_position structure. Here, the structure for - * outer_pos is initialized. It is set to point to the - * NULL string, at an offset of STARTPOS. STARTPOS is out - * of range of the NULL string, so the first call to - * getburst will patch up the rx_string_position to point - * to valid characters. - */ - - ( search_state.outer_pos.string - = search_state.outer_pos.end - = 0); - - search_state.outer_pos.offset = 0; - search_state.outer_pos.size = 0; - search_state.outer_pos.pos = (unsigned char *)startpos; - init_fastmap (rxb, &search_state); - - search_state.fastmap_resume_pt = rx_fastmap_start; - case rx_outer_fastmap: - /* do { */ - pseudo_do: - { - { - int fastmap_state; - fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure, - &search_state); - switch (fastmap_state) - { - case rx_fastmap_continuation: - pc = rx_outer_fastmap; - goto return_continuation; - case rx_fastmap_fail: - goto finish; - case rx_fastmap_ok: - break; - } - } - - /* now the fastmap loop has brought us to a plausible - * starting point for a match. so, it's time to run the - * nfa and see if a match occured. - */ - startpos = ( search_state.outer_pos.pos - - search_state.outer_pos.string - + search_state.outer_pos.offset); -#if 0 -/*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end)) -/*|*/ goto finish; -#endif - } - - search_state.test_match_resume_pt = rx_test_start; - /* do interrupted for entry point... */ - case rx_outer_test: - /* ...do continued */ - { - goto test_match; - test_returns_to_search: - switch (test_state) - { - case rx_test_continuation: - pc = rx_outer_test; - goto return_continuation; - case rx_test_error: - search_state.ret_val = rx_search_error; - goto finish; - case rx_test_fail: - break; - case rx_test_ok: - goto finish; - } - search_state.outer_pos.pos += search_state.outer_pos.search_direction; - startpos += search_state.outer_pos.search_direction; -#if 0 -/*|*/ if (search_state.test_pos.pos < search_state.test_pos.end) -/*|*/ break; -#endif - } - /* do interrupted for entry point... */ - case rx_outer_restore_pos: - { - int x; - x = get_burst (&search_state.outer_pos, app_closure, stop); - switch (x) - { - case rx_get_burst_continuation: - pc = rx_outer_restore_pos; - goto return_continuation; - case rx_get_burst_error: - search_state.ret_val = rx_search_error; - goto finish; - case rx_get_burst_no_more: - if (rxb->can_match_empty) - break; - goto finish; - case rx_get_burst_ok: - break; - } - } /* } while (...see below...) */ - - if ((search_state.outer_pos.search_direction == 1) - ? (startpos <= search_state.outer_pos.search_end) - : (startpos > search_state.outer_pos.search_end)) - goto pseudo_do; - - - finish: - uninit_fastmap (rxb, &search_state); - if (search_state.start_super) - rx_unlock_superstate (&rxb->rx, search_state.start_super); - -#ifdef regex_malloc - if (search_state.lparen) free (search_state.lparen); - if (search_state.rparen) free (search_state.rparen); - if (search_state.best_lpspace) free (search_state.best_lpspace); - if (search_state.best_rpspace) free (search_state.best_rpspace); -#endif - return search_state.ret_val; - } - - - test_match: - { - enum rx_test_match_entry test_pc; - int inx; - test_pc = search_state.test_match_resume_pt; - if (test_pc == rx_test_start) - { -#ifdef RX_DEBUG - search_state.backtrack_depth = 0; -#endif - search_state.last_l = search_state.last_r = 0; - search_state.lparen[0] = startpos; - search_state.super = search_state.start_super; - search_state.c = search_state.nfa_choice; - search_state.test_pos.pos = search_state.outer_pos.pos - 1; - search_state.test_pos.string = search_state.outer_pos.string; - search_state.test_pos.end = search_state.outer_pos.end; - search_state.test_pos.offset = search_state.outer_pos.offset; - search_state.test_pos.size = search_state.outer_pos.size; - search_state.test_pos.search_direction = 1; - search_state.counter_stack = 0; - search_state.backtrack_stack = 0; - search_state.backtrack_frame_bytes = - (sizeof (struct rx_backtrack_frame) - + (rxb->match_regs_on_stack - ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2 - : 0)); - search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64; - search_state.test_ret = rx_test_line_finished; - search_state.could_have_continued = 0; - } - /* This is while (1)...except that the body of the loop is interrupted - * by some alternative entry points. - */ - pseudo_while_1: - switch (test_pc) - { - case rx_test_cache_hit_loop: - goto resume_continuation_1; - case rx_test_backreference_check: - goto resume_continuation_2; - case rx_test_backtrack_return: - goto resume_continuation_3; - case rx_test_start: -#ifdef RX_DEBUG - /* There is a search tree with every node as set of deterministic - * transitions in the super nfa. For every branch of a - * backtrack point is an edge in the tree. - * This counts up a pre-order of nodes in that tree. - * It's saved on the search stack and printed when debugging. - */ - search_state.line_no = 0; - search_state.lines_found = 0; -#endif - - top_of_cycle: - /* A superstate is basicly a transition table, indexed by - * characters from the string being tested, and containing - * RX_INX (`instruction frame') structures. - */ - search_state.ifr = &search_state.super->transitions [search_state.c]; - - recurse_test_match: - /* This is the point to which control is sent when the - * test matcher `recurses'. Before jumping here, some variables - * need to be saved on the stack and the next instruction frame - * has to be computed. - */ - - restart: - /* Some instructions don't advance the matcher, but just - * carry out some side effects and fetch a new instruction. - * To dispatch that new instruction, `goto restart'. - */ - - { - struct rx_inx * next_tr_table; - struct rx_inx * this_tr_table; - /* The fastest route through the loop is when the instruction - * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA - * is non-zero. In that case, it points to the next - * superstate. - * - * This allows us to not bother fetching the bytecode. - */ - next_tr_table = (struct rx_inx *)search_state.ifr->data; - this_tr_table = search_state.super->transitions; - while (next_tr_table) - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - { - struct rx_superset * setp; - - fprintf (stderr, "%d %d>> re_next_char @ %d (%d)", - search_state.line_no, - search_state.backtrack_depth, - (search_state.test_pos.pos - search_state.test_pos.string - + search_state.test_pos.offset), search_state.c); - - search_state.super = - ((struct rx_superstate *) - ((char *)this_tr_table - - ((unsigned long) - ((struct rx_superstate *)0)->transitions))); - - setp = search_state.super->contents; - fprintf (stderr, " superstet (rx=%d, &=%x: ", - rxb->rx.rx_id, setp); - while (setp) - { - fprintf (stderr, "%d ", setp->id); - setp = setp->cdr; - } - fprintf (stderr, "\n"); - } -#endif - this_tr_table = next_tr_table; - ++search_state.test_pos.pos; - if (search_state.test_pos.pos == search_state.test_pos.end) - { - int burst_state; - try_burst_1: - burst_state = get_burst (&search_state.test_pos, - app_closure, stop); - switch (burst_state) - { - case rx_get_burst_continuation: - search_state.saved_this_tr_table = this_tr_table; - search_state.saved_next_tr_table = next_tr_table; - test_pc = rx_test_cache_hit_loop; - goto test_return_continuation; - - resume_continuation_1: - /* Continuation one jumps here to do its work: */ - search_state.saved_this_tr_table = this_tr_table; - search_state.saved_next_tr_table = next_tr_table; - goto try_burst_1; - - case rx_get_burst_ok: - /* get_burst succeeded...keep going */ - break; - - case rx_get_burst_no_more: - search_state.test_ret = rx_test_line_finished; - search_state.could_have_continued = 1; - goto test_do_return; - - case rx_get_burst_error: - /* An error... */ - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - } - search_state.c = *search_state.test_pos.pos; - search_state.ifr = this_tr_table + search_state.c; - next_tr_table = (struct rx_inx *)search_state.ifr->data; - } /* Fast loop through cached transition tables */ - - /* Here when we ran out of cached next-char transitions. - * So, it will be necessary to do a more expensive - * dispatch on the current instruction. The superstate - * pointer is allowed to become invalid during next-char - * transitions -- now we must bring it up to date. - */ - search_state.super = - ((struct rx_superstate *) - ((char *)this_tr_table - - ((unsigned long) - ((struct rx_superstate *)0)->transitions))); - } - - /* We've encountered an instruction other than next-char. - * Dispatch that instruction: - */ - inx = (int)search_state.ifr->inx; -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - { - struct rx_superset * setp = search_state.super->contents; - - fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no, - search_state.backtrack_depth, - inx_names[inx], - (search_state.test_pos.pos - search_state.test_pos.string - + (test_pos.half == 0 ? 0 : size1)), search_state.c); - - fprintf (stderr, " superstet (rx=%d, &=%x: ", - rxb->rx.rx_id, setp); - while (setp) - { - fprintf (stderr, "%d ", setp->id); - setp = setp->cdr; - } - fprintf (stderr, "\n"); - } -#endif - switch ((enum rx_opcode)inx) - { - case rx_do_side_effects: - - /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon - * edges associated with parentheses, backreferencing, etc. - */ - { - struct rx_distinct_future * df = - (struct rx_distinct_future *)search_state.ifr->data_2; - struct rx_se_list * el = df->effects; - /* Side effects come in lists. This walks down - * a list, dispatching. - */ - while (el) - { - long effect; - effect = (long)el->car; - if (effect < 0) - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - { - struct rx_superset * setp = search_state.super->contents; - - fprintf (stderr, "....%d %d>> %s\n", search_state.line_no, - search_state.backtrack_depth, - efnames[-effect]); - } -#endif - switch ((enum re_side_effects) effect) - - { - case re_se_pushback: - search_state.ifr = &df->future_frame; - if (!search_state.ifr->data) - { - struct rx_superstate * sup; - sup = search_state.super; - rx_lock_superstate (rx, sup); - if (!rx_handle_cache_miss (&rxb->rx, - search_state.super, - search_state.c, - (search_state.ifr - ->data_2))) - { - rx_unlock_superstate (rx, sup); - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - rx_unlock_superstate (rx, sup); - } - /* --search_state.test_pos.pos; */ - search_state.c = 't'; - search_state.super - = ((struct rx_superstate *) - ((char *)search_state.ifr->data - - (long)(((struct rx_superstate *)0) - ->transitions))); - goto top_of_cycle; - break; - case re_se_push0: - { - struct rx_counter_frame * old_cf - = (search_state.counter_stack - ? ((struct rx_counter_frame *) - search_state.counter_stack->sp) - : 0); - struct rx_counter_frame * cf; - PUSH (search_state.counter_stack, - sizeof (struct rx_counter_frame)); - cf = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - cf->tag = re_se_iter; - cf->val = 0; - cf->inherited_from = 0; - cf->cdr = old_cf; - break; - } - case re_se_fail: - goto test_do_return; - case re_se_begbuf: - if (!AT_STRINGS_BEG ()) - goto test_do_return; - break; - case re_se_endbuf: - if (!AT_STRINGS_END ()) - goto test_do_return; - break; - case re_se_wordbeg: - if ( LETTER_P (&search_state.test_pos, 1) - && ( AT_STRINGS_BEG() - || !LETTER_P (&search_state.test_pos, 0))) - break; - else - goto test_do_return; - case re_se_wordend: - if ( !AT_STRINGS_BEG () - && LETTER_P (&search_state.test_pos, 0) - && (AT_STRINGS_END () - || !LETTER_P (&search_state.test_pos, 1))) - break; - else - goto test_do_return; - case re_se_wordbound: - if (AT_WORD_BOUNDARY (&search_state.test_pos)) - break; - else - goto test_do_return; - case re_se_notwordbound: - if (!AT_WORD_BOUNDARY (&search_state.test_pos)) - break; - else - goto test_do_return; - case re_se_hat: - if (AT_STRINGS_BEG ()) - { - if (rxb->not_bol) - goto test_do_return; - else - break; - } - else - { - char pos_c = *search_state.test_pos.pos; - if ( (SRCH_TRANSLATE (pos_c) - == SRCH_TRANSLATE('\n')) - && rxb->newline_anchor) - break; - else - goto test_do_return; - } - case re_se_dollar: - if (AT_STRINGS_END ()) - { - if (rxb->not_eol) - goto test_do_return; - else - break; - } - else - { - if ( ( SRCH_TRANSLATE (fetch_char - (&search_state.test_pos, 1, - app_closure, stop)) - == SRCH_TRANSLATE ('\n')) - && rxb->newline_anchor) - break; - else - goto test_do_return; - } - - case re_se_try: - /* This is the first side effect in every - * expression. - * - * FOR NO GOOD REASON...get rid of it... - */ - break; - - case re_se_pushpos: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - struct rx_counter_frame * old_cf - = (search_state.counter_stack - ? ((struct rx_counter_frame *) - search_state.counter_stack->sp) - : 0); - struct rx_counter_frame * cf; - PUSH(search_state.counter_stack, - sizeof (struct rx_counter_frame)); - cf = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - cf->tag = re_se_pushpos; - cf->val = urhere; - cf->inherited_from = 0; - cf->cdr = old_cf; - break; - } - - case re_se_chkpos: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - struct rx_counter_frame * cf - = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - if (cf->val == urhere) - goto test_do_return; - cf->val = urhere; - break; - } - break; - - case re_se_poppos: - POP(search_state.counter_stack, - sizeof (struct rx_counter_frame)); - break; - - - case re_se_at_dot: - case re_se_syntax: - case re_se_not_syntax: -#ifdef emacs - /* - * this release lacks emacs support - */ -#endif - break; - case re_se_win: - case re_se_lparen: - case re_se_rparen: - case re_se_backref: - case re_se_iter: - case re_se_end_iter: - case re_se_tv: - case re_floogle_flap: - search_state.ret_val = 0; - goto test_do_return; - } - } - else - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no, - search_state.backtrack_depth, - efnames2[rxb->se_params [effect].se], - rxb->se_params [effect].op1, - rxb->se_params [effect].op2); -#endif - switch (rxb->se_params [effect].se) - { - case re_se_win: - /* This side effect indicates that we've - * found a match, though not necessarily the - * best match. This is a fancy assignment to - * register 0 unless the caller didn't - * care about registers. In which case, - * this stops the match. - */ - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - - if ( (search_state.best_last_r < 0) - || (urhere + 1 > search_state.best_rparen[0])) - { - /* Record the best known and keep - * looking. - */ - int x; - for (x = 0; x <= search_state.last_l; ++x) - search_state.best_lparen[x] = search_state.lparen[x]; - search_state.best_last_l = search_state.last_l; - for (x = 0; x <= search_state.last_r; ++x) - search_state.best_rparen[x] = search_state.rparen[x]; - search_state.best_rparen[0] = urhere + 1; - search_state.best_last_r = search_state.last_r; - } - /* If we're not reporting the match-length - * or other register info, we need look no - * further. - */ - if (search_state.first_found) - { - search_state.test_ret = rx_test_found_first; - goto test_do_return; - } - } - break; - case re_se_lparen: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - - int reg = rxb->se_params [effect].op1; -#if 0 - if (reg > search_state.last_l) -#endif - { - search_state.lparen[reg] = urhere + 1; - /* In addition to making this assignment, - * we now know that lower numbered regs - * that haven't already been assigned, - * won't be. We make sure they're - * filled with -1, so they can be - * recognized as unassigned. - */ - if (search_state.last_l < reg) - while (++search_state.last_l < reg) - search_state.lparen[search_state.last_l] = -1; - } - break; - } - - case re_se_rparen: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - int reg = rxb->se_params [effect].op1; - search_state.rparen[reg] = urhere + 1; - if (search_state.last_r < reg) - { - while (++search_state.last_r < reg) - search_state.rparen[search_state.last_r] - = -1; - } - break; - } - - case re_se_backref: - { - int reg = rxb->se_params [effect].op1; - if ( reg > search_state.last_r - || search_state.rparen[reg] < 0) - goto test_do_return; - - { - int backref_status; - check_backreference: - backref_status - = back_check (&search_state.test_pos, - search_state.lparen[reg], - search_state.rparen[reg], - search_state.translate, - app_closure, - stop); - switch (backref_status) - { - case rx_back_check_continuation: - search_state.saved_reg = reg; - test_pc = rx_test_backreference_check; - goto test_return_continuation; - resume_continuation_2: - reg = search_state.saved_reg; - goto check_backreference; - case rx_back_check_fail: - /* Fail */ - goto test_do_return; - case rx_back_check_pass: - /* pass -- - * test_pos now advanced to last - * char matched by backref - */ - break; - } - } - break; - } - case re_se_iter: - { - struct rx_counter_frame * csp - = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - if (csp->val == rxb->se_params[effect].op2) - goto test_do_return; - else - ++csp->val; - break; - } - case re_se_end_iter: - { - struct rx_counter_frame * csp - = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - if (csp->val < rxb->se_params[effect].op1) - goto test_do_return; - else - { - struct rx_counter_frame * source = csp; - while (source->inherited_from) - source = source->inherited_from; - if (!source || !source->cdr) - { - POP(search_state.counter_stack, - sizeof(struct rx_counter_frame)); - } - else - { - source = source->cdr; - csp->val = source->val; - csp->tag = source->tag; - csp->cdr = 0; - csp->inherited_from = source; - } - } - break; - } - case re_se_tv: - /* is a noop */ - break; - case re_se_try: - case re_se_pushback: - case re_se_push0: - case re_se_pushpos: - case re_se_chkpos: - case re_se_poppos: - case re_se_at_dot: - case re_se_syntax: - case re_se_not_syntax: - case re_se_begbuf: - case re_se_hat: - case re_se_wordbeg: - case re_se_wordbound: - case re_se_notwordbound: - case re_se_wordend: - case re_se_endbuf: - case re_se_dollar: - case re_se_fail: - case re_floogle_flap: - search_state.ret_val = 0; - goto test_do_return; - } - } - el = el->cdr; - } - /* Now the side effects are done, - * so get the next instruction. - * and move on. - */ - search_state.ifr = &df->future_frame; - goto restart; - } - - case rx_backtrack_point: - { - /* A backtrack point indicates that we've reached a - * non-determinism in the superstate NFA. This is a - * loop that exhaustively searches the possibilities. - * - * A backtracking strategy is used. We keep track of what - * registers are valid so we can erase side effects. - * - * First, make sure there is some stack space to hold - * our state. - */ - - struct rx_backtrack_frame * bf; - - PUSH(search_state.backtrack_stack, - search_state.backtrack_frame_bytes); -#ifdef RX_DEBUG_0 - ++search_state.backtrack_depth; -#endif - - bf = ((struct rx_backtrack_frame *) - search_state.backtrack_stack->sp); - { - bf->stk_super = search_state.super; - /* We prevent the current superstate from being - * deleted from the superstate cache. - */ - rx_lock_superstate (&rxb->rx, search_state.super); -#ifdef RX_DEBUG_0 - bf->stk_search_state.line_no = search_state.line_no; -#endif - bf->stk_c = search_state.c; - bf->stk_test_pos = search_state.test_pos; - bf->stk_last_l = search_state.last_l; - bf->stk_last_r = search_state.last_r; - bf->df = ((struct rx_super_edge *) - search_state.ifr->data_2)->options; - bf->first_df = bf->df; - bf->counter_stack_sp = (search_state.counter_stack - ? search_state.counter_stack->sp - : 0); - bf->stk_test_ret = search_state.test_ret; - if (rxb->match_regs_on_stack) - { - int x; - regoff_t * stk = - (regoff_t *)((char *)bf + sizeof (*bf)); - for (x = 0; x <= search_state.last_l; ++x) - stk[x] = search_state.lparen[x]; - stk += x; - for (x = 0; x <= search_state.last_r; ++x) - stk[x] = search_state.rparen[x]; - } - } - - /* Here is a while loop whose body is mainly a function - * call and some code to handle a return from that - * function. - * - * From here on for the rest of `case backtrack_point' it - * is unsafe to assume that the search_state copies of - * variables saved on the backtracking stack are valid - * -- so read their values from the backtracking stack. - * - * This lets us use one generation fewer stack saves in - * the call-graph of a search. - */ - - while_non_det_options: -#ifdef RX_DEBUG_0 - ++search_state.lines_found; - if (rx_debug_trace) - fprintf (stderr, "@@@ %d calls %d @@@\n", - search_state.line_no, search_state.lines_found); - - search_state.line_no = search_state.lines_found; -#endif - - if (bf->df->next_same_super_edge[0] == bf->first_df) - { - /* This is a tail-call optimization -- we don't recurse - * for the last of the possible futures. - */ - search_state.ifr = (bf->df->effects - ? &bf->df->side_effects_frame - : &bf->df->future_frame); - - rx_unlock_superstate (&rxb->rx, search_state.super); - POP(search_state.backtrack_stack, - search_state.backtrack_frame_bytes); -#ifdef RX_DEBUG - --search_state.backtrack_depth; -#endif - goto restart; - } - else - { - if (search_state.counter_stack) - { - struct rx_counter_frame * old_cf - = ((struct rx_counter_frame *)search_state.counter_stack->sp); - struct rx_counter_frame * cf; - PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame)); - cf = ((struct rx_counter_frame *)search_state.counter_stack->sp); - cf->tag = old_cf->tag; - cf->val = old_cf->val; - cf->inherited_from = old_cf; - cf->cdr = 0; - } - /* `Call' this test-match block */ - search_state.ifr = (bf->df->effects - ? &bf->df->side_effects_frame - : &bf->df->future_frame); - goto recurse_test_match; - } - - /* Returns in this block are accomplished by - * goto test_do_return. There are two cases. - * If there is some search-stack left, - * then it is a return from a `recursive' call. - * If there is no search-stack left, then - * we should return to the fastmap/search loop. - */ - - test_do_return: - - if (!search_state.backtrack_stack) - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - fprintf (stderr, "!!! %d bails returning %d !!!\n", - search_state.line_no, search_state.test_ret); -#endif - - /* No more search-stack -- this test is done. */ - if (search_state.test_ret != rx_test_internal_error) - goto return_from_test_match; - else - goto error_in_testing_match; - } - - /* Returning from a recursive call to - * the test match block: - */ - - bf = ((struct rx_backtrack_frame *) - search_state.backtrack_stack->sp); -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - fprintf (stderr, "+++ %d returns %d (to %d)+++\n", - search_state.line_no, - search_state.test_ret, - bf->stk_search_state.line_no); -#endif - - while (search_state.counter_stack - && (!bf->counter_stack_sp - || (bf->counter_stack_sp - != search_state.counter_stack->sp))) - { - POP(search_state.counter_stack, - sizeof (struct rx_counter_frame)); - } - - if (search_state.test_ret == rx_test_internal_error) - { - POP (search_state.backtrack_stack, - search_state.backtrack_frame_bytes); - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - - /* If a non-longest match was found and that is good - * enough, return immediately. - */ - if ( (search_state.test_ret == rx_test_found_first) - && search_state.first_found) - { - rx_unlock_superstate (&rxb->rx, bf->stk_super); - POP (search_state.backtrack_stack, - search_state.backtrack_frame_bytes); - goto test_do_return; - } - - search_state.test_ret = bf->stk_test_ret; - search_state.last_l = bf->stk_last_l; - search_state.last_r = bf->stk_last_r; - bf->df = bf->df->next_same_super_edge[0]; - search_state.super = bf->stk_super; - search_state.c = bf->stk_c; -#ifdef RX_DEBUG_0 - search_state.line_no = bf->stk_search_state.line_no; -#endif - - if (rxb->match_regs_on_stack) - { - int x; - regoff_t * stk = - (regoff_t *)((char *)bf + sizeof (*bf)); - for (x = 0; x <= search_state.last_l; ++x) - search_state.lparen[x] = stk[x]; - stk += x; - for (x = 0; x <= search_state.last_r; ++x) - search_state.rparen[x] = stk[x]; - } - - { - int x; - try_burst_2: - x = get_burst (&bf->stk_test_pos, app_closure, stop); - switch (x) - { - case rx_get_burst_continuation: - search_state.saved_bf = bf; - test_pc = rx_test_backtrack_return; - goto test_return_continuation; - resume_continuation_3: - bf = search_state.saved_bf; - goto try_burst_2; - case rx_get_burst_no_more: - /* Since we've been here before, it is some kind of - * error that we can't return. - */ - case rx_get_burst_error: - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - case rx_get_burst_ok: - break; - } - } - search_state.test_pos = bf->stk_test_pos; - goto while_non_det_options; - } - - - case rx_cache_miss: - /* Because the superstate NFA is lazily constructed, - * and in fact may erode from underneath us, we sometimes - * have to construct the next instruction from the hard way. - * This invokes one step in the lazy-conversion. - */ - search_state.ifr = rx_handle_cache_miss (&rxb->rx, - search_state.super, - search_state.c, - search_state.ifr->data_2); - if (!search_state.ifr) - { - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - goto restart; - - case rx_backtrack: - /* RX_BACKTRACK means that we've reached the empty - * superstate, indicating that match can't succeed - * from this point. - */ - goto test_do_return; - - case rx_next_char: - case rx_error_inx: - case rx_num_instructions: - search_state.ret_val = 0; - goto test_do_return; - } - goto pseudo_while_1; - } - - /* Healthy exits from the test-match loop do a - * `goto return_from_test_match' On the other hand, - * we might end up here. - */ - error_in_testing_match: - test_state = rx_test_error; - goto test_returns_to_search; - - /***** fastmap/search loop body - * considering the results testing for a match - */ - - return_from_test_match: - - if (search_state.best_last_l >= 0) - { - if (regs && (regs->start != search_state.best_lparen)) - { - bcopy (search_state.best_lparen, regs->start, - regs->num_regs * sizeof (int)); - bcopy (search_state.best_rparen, regs->end, - regs->num_regs * sizeof (int)); - } - if (regs && !rxb->no_sub) - { - int q; - int bound = (regs->num_regs > search_state.num_regs - ? regs->num_regs - : search_state.num_regs); - regoff_t * s = regs->start; - regoff_t * e = regs->end; - for (q = search_state.best_last_l + 1; q < bound; ++q) - s[q] = e[q] = -1; - } - search_state.ret_val = search_state.best_lparen[0]; - test_state = rx_test_ok; - goto test_returns_to_search; - } - else - { - test_state = rx_test_fail; - goto test_returns_to_search; - } - - test_return_continuation: - search_state.test_match_resume_pt = test_pc; - test_state = rx_test_continuation; - goto test_returns_to_search; - } -} - - - -#endif /* RX_WANT_RX_DEFS */ - - - -#else /* RX_WANT_SE_DEFS */ - /* Integers are used to represent side effects. - * - * Simple side effects are given negative integer names by these enums. - * - * Non-negative names are reserved for complex effects. - * - * Complex effects are those that take arguments. For example, - * a register assignment associated with a group is complex because - * it requires an argument to tell which group is being matched. - * - * The integer name of a complex effect is an index into rxb->se_params. - */ - - RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */ - - RX_DEF_SE(0, re_se_pushback, = re_se_try - 1) - RX_DEF_SE(0, re_se_push0, = re_se_pushback -1) - RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1) - RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1) - RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1) - - RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */ - RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */ - RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */ - - RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */ - RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */ - - RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1) - RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1) - RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1) - - RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1) - RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1) - - /* This fails except at the end of a line. - * It deserves to go here since it is typicly one of the last steps - * in a match. - */ - RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1) - - /* Simple effects: */ - RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1) - - /* Complex effects. These are used in the 'se' field of - * a struct re_se_params. Indexes into the se array - * are stored as instructions on nfa edges. - */ - RX_DEF_CPLX_SE(1, re_se_win, = 0) - RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1) - RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1) - RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1) - RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1) - RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1) - RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1) - -#endif - -#endif |