src - FreeBSD source tree

diff options


context:
space:
mode:

author	Jordan K. Hubbard <jkh@FreeBSD.org>	1993-06-18 04:22:21 +0000
committer	Jordan K. Hubbard <jkh@FreeBSD.org>	1993-06-18 04:22:21 +0000
commit	b76095a4307cc94ec7cd722853f9b032e45e6ea4 (patch)
tree	890f91d43eec35dc2f71a54410491f6503ca5b38 /gnu/usr.bin/awk
parent	7c434002a4e47486e9a2d7b2f32b1ddf42d37e2a (diff)
download	src-b76095a4307cc94ec7cd722853f9b032e45e6ea4.tar.gz src-b76095a4307cc94ec7cd722853f9b032e45e6ea4.zip

Updated GNU utilities

Notes

Notes: svn path=/cvs2svn/branches/unlabeled-1.1.1/; revision=9

Diffstat (limited to 'gnu/usr.bin/awk')

-rw-r--r--

gnu/usr.bin/awk/ACKNOWLEDGMENT

-rw-r--r--

gnu/usr.bin/awk/COPYING

340

-rw-r--r--

gnu/usr.bin/awk/FUTURES

120

-rw-r--r--

gnu/usr.bin/awk/LIMITATIONS

-rw-r--r--

gnu/usr.bin/awk/Makefile

-rw-r--r--

gnu/usr.bin/awk/NEWS

1295

-rw-r--r--

gnu/usr.bin/awk/PORTS

-rw-r--r--

gnu/usr.bin/awk/POSIX

-rw-r--r--

gnu/usr.bin/awk/PROBLEMS

-rw-r--r--

gnu/usr.bin/awk/README

116

-rw-r--r--

gnu/usr.bin/awk/array.c

293

-rw-r--r--

gnu/usr.bin/awk/awk.1

1873

-rw-r--r--

gnu/usr.bin/awk/awk.h

763

-rw-r--r--

gnu/usr.bin/awk/awk.y

1804

-rw-r--r--

gnu/usr.bin/awk/builtin.c

1133

-rw-r--r--

gnu/usr.bin/awk/config.h

272

-rw-r--r--

gnu/usr.bin/awk/dfa.c

2291

-rw-r--r--

gnu/usr.bin/awk/dfa.h

543

-rw-r--r--

gnu/usr.bin/awk/eval.c

1225

-rw-r--r--

gnu/usr.bin/awk/field.c

645

-rw-r--r--

gnu/usr.bin/awk/gawk.texi

11270

-rw-r--r--

gnu/usr.bin/awk/getopt.c

662

-rw-r--r--

gnu/usr.bin/awk/getopt.h

128

-rw-r--r--

gnu/usr.bin/awk/getopt1.c

160

-rw-r--r--

gnu/usr.bin/awk/io.c

1207

-rw-r--r--

gnu/usr.bin/awk/iop.c

318

-rw-r--r--

gnu/usr.bin/awk/main.c

731

-rw-r--r--

gnu/usr.bin/awk/msg.c

106

-rw-r--r--

gnu/usr.bin/awk/node.c

429

-rw-r--r--

gnu/usr.bin/awk/patchlevel.h

-rw-r--r--

gnu/usr.bin/awk/protos.h

115

-rw-r--r--

gnu/usr.bin/awk/re.c

208

-rw-r--r--

gnu/usr.bin/awk/regex.c

2854

-rw-r--r--

gnu/usr.bin/awk/regex.h

260

-rw-r--r--

gnu/usr.bin/awk/version.c

35 files changed, 31389 insertions, 0 deletions

diff --git a/gnu/usr.bin/awk/ACKNOWLEDGMENT b/gnu/usr.bin/awk/ACKNOWLEDGMENT
new file mode 100644
index 000000000000..b6c3b0b0c692
--- /dev/null
+++ b/gnu/usr.bin/awk/ACKNOWLEDGMENT

@@ -0,0 +1,21 @@

+The current developers of Gawk would like to thank and acknowledge the

+many people who have contributed to the development through bug reports

+and fixes and suggestions. Unfortunately, we have not been organized

+enough to keep track of all the names -- for that we apologize.

+Another group of people have assisted even more by porting Gawk to new

+platforms and providing a great deal of feedback. They are:

+ Hal Peterson <hrp@pecan.cray.com> (Cray)

+ Pat Rankin <gawk.rankin@EQL.Caltech.Edu> (VMS)

+ Michal Jaegermann <NTOMCZAK@vm.ucs.UAlberta.CA> (Atari, NeXT, DEC 3100)

+ Mike Lijewski <mjlx@eagle.cnsf.cornell.edu> (IBM RS6000)

+ Scott Deifik <scottd@amgen.com> (MSDOS 2.14)

+ Kent Williams (MSDOS 2.11)

+ Conrad Kwok (MSDOS earlier versions)

+ Scott Garfinkle (MSDOS earlier versions)

+Last, but far from least, we would like to thank Brian Kernighan who

+has helped to clear up many dark corners of the language and provided a

+restraining touch when we have been overly tempted by "feeping

+creaturism".

diff --git a/gnu/usr.bin/awk/COPYING b/gnu/usr.bin/awk/COPYING
new file mode 100644
index 000000000000..3358a7be862a
--- /dev/null
+++ b/gnu/usr.bin/awk/COPYING

@@ -0,0 +1,340 @@

+ GNU GENERAL PUBLIC LICENSE

+ Version 2, June 1991

+ 675 Mass Ave, Cambridge, MA 02139, USA

+ Everyone is permitted to copy and distribute verbatim copies

+ of this license document, but changing it is not allowed.

+ Preamble

+ The licenses for most software are designed to take away your

+freedom to share and change it. By contrast, the GNU General Public

+License is intended to guarantee your freedom to share and change free

+software--to make sure the software is free for all its users. This

+General Public License applies to most of the Free Software

+Foundation's software and to any other program whose authors commit to

+using it. (Some other Free Software Foundation software is covered by

+the GNU Library General Public License instead.) You can apply it to

+your programs, too.

+ When we speak of free software, we are referring to freedom, not

+price. Our General Public Licenses are designed to make sure that you

+have the freedom to distribute copies of free software (and charge for

+this service if you wish), that you receive source code or can get it

+if you want it, that you can change the software or use pieces of it

+in new free programs; and that you know you can do these things.

+ To protect your rights, we need to make restrictions that forbid

+anyone to deny you these rights or to ask you to surrender the rights.

+These restrictions translate to certain responsibilities for you if you

+distribute copies of the software, or if you modify it.

+ For example, if you distribute copies of such a program, whether

+gratis or for a fee, you must give the recipients all the rights that

+you have. You must make sure that they, too, receive or can get the

+source code. And you must show them these terms so they know their

+rights.

+ We protect your rights with two steps: (1) copyright the software, and

+(2) offer you this license which gives you legal permission to copy,

+distribute and/or modify the software.

+ Also, for each author's protection and ours, we want to make certain

+that everyone understands that there is no warranty for this free

+software. If the software is modified by someone else and passed on, we

+want its recipients to know that what they have is not the original, so

+that any problems introduced by others will not reflect on the original

+authors' reputations.

+ Finally, any free program is threatened constantly by software

+patents. We wish to avoid the danger that redistributors of a free

+program will individually obtain patent licenses, in effect making the

+program proprietary. To prevent this, we have made it clear that any

+patent must be licensed for everyone's free use or not licensed at all.

+ The precise terms and conditions for copying, distribution and

+modification follow.

+ GNU GENERAL PUBLIC LICENSE

+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

+ 0. This License applies to any program or other work which contains

+a notice placed by the copyright holder saying it may be distributed

+under the terms of this General Public License. The "Program", below,

+refers to any such program or work, and a "work based on the Program"

+means either the Program or any derivative work under copyright law:

+that is to say, a work containing the Program or a portion of it,

+either verbatim or with modifications and/or translated into another

+language. (Hereinafter, translation is included without limitation in

+the term "modification".) Each licensee is addressed as "you".

+Activities other than copying, distribution and modification are not

+covered by this License; they are outside its scope. The act of

+running the Program is not restricted, and the output from the Program

+is covered only if its contents constitute a work based on the

+Program (independent of having been made by running the Program).

+Whether that is true depends on what the Program does.

+ 1. You may copy and distribute verbatim copies of the Program's

+source code as you receive it, in any medium, provided that you

+conspicuously and appropriately publish on each copy an appropriate

+copyright notice and disclaimer of warranty; keep intact all the

+notices that refer to this License and to the absence of any warranty;

+and give any other recipients of the Program a copy of this License

+along with the Program.

+You may charge a fee for the physical act of transferring a copy, and

+you may at your option offer warranty protection in exchange for a fee.

+ 2. You may modify your copy or copies of the Program or any portion

+of it, thus forming a work based on the Program, and copy and

+distribute such modifications or work under the terms of Section 1

+above, provided that you also meet all of these conditions:

+ a) You must cause the modified files to carry prominent notices

+ stating that you changed the files and the date of any change.

+ b) You must cause any work that you distribute or publish, that in

+ whole or in part contains or is derived from the Program or any

+ part thereof, to be licensed as a whole at no charge to all third

+ parties under the terms of this License.

+ c) If the modified program normally reads commands interactively

+ when run, you must cause it, when started running for such

+ interactive use in the most ordinary way, to print or display an

+ announcement including an appropriate copyright notice and a

+ notice that there is no warranty (or else, saying that you provide

+ a warranty) and that users may redistribute the program under

+ these conditions, and telling the user how to view a copy of this

+ License. (Exception: if the Program itself is interactive but

+ does not normally print such an announcement, your work based on

+ the Program is not required to print an announcement.)

+These requirements apply to the modified work as a whole. If

+identifiable sections of that work are not derived from the Program,

+and can be reasonably considered independent and separate works in

+themselves, then this License, and its terms, do not apply to those

+sections when you distribute them as separate works. But when you

+distribute the same sections as part of a whole which is a work based

+on the Program, the distribution of the whole must be on the terms of

+this License, whose permissions for other licensees extend to the

+entire whole, and thus to each and every part regardless of who wrote it.

+Thus, it is not the intent of this section to claim rights or contest

+your rights to work written entirely by you; rather, the intent is to

+exercise the right to control the distribution of derivative or

+collective works based on the Program.

+In addition, mere aggregation of another work not based on the Program

+with the Program (or with a work based on the Program) on a volume of

+a storage or distribution medium does not bring the other work under

+the scope of this License.

+ 3. You may copy and distribute the Program (or a work based on it,

+under Section 2) in object code or executable form under the terms of

+Sections 1 and 2 above provided that you also do one of the following:

+ a) Accompany it with the complete corresponding machine-readable

+ source code, which must be distributed under the terms of Sections

+ 1 and 2 above on a medium customarily used for software interchange; or,

+ b) Accompany it with a written offer, valid for at least three

+ years, to give any third party, for a charge no more than your

+ cost of physically performing source distribution, a complete

+ machine-readable copy of the corresponding source code, to be

+ distributed under the terms of Sections 1 and 2 above on a medium

+ customarily used for software interchange; or,

+ c) Accompany it with the information you received as to the offer

+ to distribute corresponding source code. (This alternative is

+ allowed only for noncommercial distribution and only if you

+ received the program in object code or executable form with such

+ an offer, in accord with Subsection b above.)

+The source code for a work means the preferred form of the work for

+making modifications to it. For an executable work, complete source

+code means all the source code for all modules it contains, plus any

+associated interface definition files, plus the scripts used to

+control compilation and installation of the executable. However, as a

+special exception, the source code distributed need not include

+anything that is normally distributed (in either source or binary

+form) with the major components (compiler, kernel, and so on) of the

+operating system on which the executable runs, unless that component

+itself accompanies the executable.

+If distribution of executable or object code is made by offering

+access to copy from a designated place, then offering equivalent

+access to copy the source code from the same place counts as

+distribution of the source code, even though third parties are not

+compelled to copy the source along with the object code.

+ 4. You may not copy, modify, sublicense, or distribute the Program

+except as expressly provided under this License. Any attempt

+otherwise to copy, modify, sublicense or distribute the Program is

+void, and will automatically terminate your rights under this License.

+However, parties who have received copies, or rights, from you under

+this License will not have their licenses terminated so long as such

+parties remain in full compliance.

+ 5. You are not required to accept this License, since you have not

+signed it. However, nothing else grants you permission to modify or

+distribute the Program or its derivative works. These actions are

+prohibited by law if you do not accept this License. Therefore, by

+modifying or distributing the Program (or any work based on the

+Program), you indicate your acceptance of this License to do so, and

+all its terms and conditions for copying, distributing or modifying

+the Program or works based on it.

+ 6. Each time you redistribute the Program (or any work based on the

+Program), the recipient automatically receives a license from the

+original licensor to copy, distribute or modify the Program subject to

+these terms and conditions. You may not impose any further

+restrictions on the recipients' exercise of the rights granted herein.

+You are not responsible for enforcing compliance by third parties to

+this License.

+ 7. If, as a consequence of a court judgment or allegation of patent

+infringement or for any other reason (not limited to patent issues),

+conditions are imposed on you (whether by court order, agreement or

+otherwise) that contradict the conditions of this License, they do not

+excuse you from the conditions of this License. If you cannot

+distribute so as to satisfy simultaneously your obligations under this

+License and any other pertinent obligations, then as a consequence you

+may not distribute the Program at all. For example, if a patent

+license would not permit royalty-free redistribution of the Program by

+all those who receive copies directly or indirectly through you, then

+the only way you could satisfy both it and this License would be to

+refrain entirely from distribution of the Program.

+If any portion of this section is held invalid or unenforceable under

+any particular circumstance, the balance of the section is intended to

+apply and the section as a whole is intended to apply in other

+circumstances.

+It is not the purpose of this section to induce you to infringe any

+patents or other property right claims or to contest validity of any

+such claims; this section has the sole purpose of protecting the

+integrity of the free software distribution system, which is

+implemented by public license practices. Many people have made

+generous contributions to the wide range of software distributed

+through that system in reliance on consistent application of that

+system; it is up to the author/donor to decide if he or she is willing

+to distribute software through any other system and a licensee cannot

+impose that choice.

+This section is intended to make thoroughly clear what is believed to

+be a consequence of the rest of this License.

+ 8. If the distribution and/or use of the Program is restricted in

+certain countries either by patents or by copyrighted interfaces, the

+original copyright holder who places the Program under this License

+may add an explicit geographical distribution limitation excluding

+those countries, so that distribution is permitted only in or among

+countries not thus excluded. In such case, this License incorporates

+the limitation as if written in the body of this License.

+ 9. The Free Software Foundation may publish revised and/or new versions

+of the General Public License from time to time. Such new versions will

+be similar in spirit to the present version, but may differ in detail to

+address new problems or concerns.

+Each version is given a distinguishing version number. If the Program

+specifies a version number of this License which applies to it and "any

+later version", you have the option of following the terms and conditions

+either of that version or of any later version published by the Free

+Software Foundation. If the Program does not specify a version number of

+this License, you may choose any version ever published by the Free Software

+Foundation.

+ 10. If you wish to incorporate parts of the Program into other free

+programs whose distribution conditions are different, write to the author

+to ask for permission. For software which is copyrighted by the Free

+Software Foundation, write to the Free Software Foundation; we sometimes

+make exceptions for this. Our decision will be guided by the two goals

+of preserving the free status of all derivatives of our free software and

+of promoting the sharing and reuse of software generally.

+ NO WARRANTY

+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY

+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN

+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES

+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED

+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF

+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS

+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE

+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,

+REPAIR OR CORRECTION.

+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING

+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR

+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,

+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING

+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED

+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY

+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER

+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE

+POSSIBILITY OF SUCH DAMAGES.

+ END OF TERMS AND CONDITIONS

+ Appendix: How to Apply These Terms to Your New Programs

+ If you develop a new program, and you want it to be of the greatest

+possible use to the public, the best way to achieve this is to make it

+free software which everyone can redistribute and change under these terms.

+ To do so, attach the following notices to the program. It is safest

+to attach them to the start of each source file to most effectively

+convey the exclusion of warranty; and each file should have at least

+the "copyright" line and a pointer to where the full notice is found.

+ <one line to give the program's name and a brief idea of what it does.>

+ Copyright (C) 19yy <name of author>

+ This program is free software; you can redistribute it and/or modify

+ it under the terms of the GNU General Public License as published by

+ the Free Software Foundation; either version 2 of the License, or

+ (at your option) any later version.

+ This program is distributed in the hope that it will be useful,

+ but WITHOUT ANY WARRANTY; without even the implied warranty of

+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

+ GNU General Public License for more details.

+ You should have received a copy of the GNU General Public License

+ along with this program; if not, write to the Free Software

+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

+Also add information on how to contact you by electronic and paper mail.

+If the program is interactive, make it output a short notice like this

+when it starts in an interactive mode:

+ Gnomovision version 69, Copyright (C) 19yy name of author

+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.

+ This is free software, and you are welcome to redistribute it

+ under certain conditions; type `show c' for details.

+The hypothetical commands `show w' and `show c' should show the appropriate

+parts of the General Public License. Of course, the commands you use may

+be called something other than `show w' and `show c'; they could even be

+mouse-clicks or menu items--whatever suits your program.

+You should also get your employer (if you work as a programmer) or your

+school, if any, to sign a "copyright disclaimer" for the program, if

+necessary. Here is a sample; alter the names:

+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program

+ `Gnomovision' (which makes passes at compilers) written by James Hacker.

+ <signature of Ty Coon>, 1 April 1989

+ Ty Coon, President of Vice

+This General Public License does not permit incorporating your program into

+proprietary programs. If your program is a subroutine library, you may

+consider it more useful to permit linking proprietary applications with the

+library. If this is what you want to do, use the GNU Library General

+Public License instead of this License.

diff --git a/gnu/usr.bin/awk/FUTURES b/gnu/usr.bin/awk/FUTURES
new file mode 100644
index 000000000000..b09656046b27
--- /dev/null
+++ b/gnu/usr.bin/awk/FUTURES

@@ -0,0 +1,120 @@

+This file lists future projects and enhancements for gawk. Items are listed

+in roughly the order they will be done for a given release. This file is

+mainly for use by the developers to help keep themselves on track; please

+don't bug us too much about schedules or what all this really means.

+For 2.16

+========

+David:

+ Move to autoconf-based configure system.

+ Allow RS to be a regexp.

+ RT variable to hold text of record terminator

+ RECLEN variable for fixed length records

+ Feed back alloca.s changes to FSF

+ Extensible hashing in memory of awk arrays

+ Split() with null string as third arg to split up strings

+ Analogously, setting FS="" would split the input record into individual

+ characters.

+Arnold:

+ Generalize IGNORECASE

+ any value makes it work, not just numeric non-zero

+ make it apply to *all* string comparisons

+ Fix FILENAME to have an initial value of "", not "-"

+ Clean up code by isolating system-specific functions in separate files.

+ Undertake significant directory reorganization.

+ Extensive manual cleanup:

+ Use of texinfo 2.0 features

+ Lots more examples

+ Document all of the above.

+In 2.17

+=======

+David:

+ Incorporate newer dfa.c and regex.c (go to POSIX regexps)

+ Make regex + dfa less dependent on gawk header file includes

+ General sub functions:

+ edit(line, pat, sub) and gedit(line, pat, sub)

+ that return the substituted strings and allow \1 etc. in the sub string.

+Arnold:

+ DBM storage of awk arrays. Try to allow multiple dbm packages

+ ? Have strftime() pay attention to the value of ENVIRON["TZ"]

+ Additional manual features:

+ Document posix regexps

+ Document use of dbm arrays

+ ? Add an error messages section to the manual

+ ? A section on where gawk is bounded

+ regex

+ i/o

+ sun fp conversions

+For 2.18

+========

+Arnold:

+ Add chdir and stat built-in functions.

+ Add function pointers as valid variable types.

+ Add an `ftw' built-in function that takes a function pointer.

+David:

+ Do an optimization pass over parse tree?

+For 2.19 or later:

+==================

+Add variables similar to C's __FILE__ and __LINE__ for better diagnostics

+from within awk programs.

+Add an explicit concatenation operator and assignment version.

+? Add a switch statement

+Add the ability to seek on an open file and retrieve the current file position.

+Add lint checking everywhere, including a check for the use of builtin vars

+that exist only in new awk.

+"restart" keyword

+Add |&

+Make awk '/foo/' files... run at egrep speeds

+Do a reference card

+Allow OFMT to be other than a floating point format.

+Allow redefining of builtin functions?

+Make it faster and smaller.

+For 3.x:

+========

+Create a gawk compiler?

+Create a gawk-to-C translator? (or C++??)

+Provide awk profiling and debugging.

diff --git a/gnu/usr.bin/awk/LIMITATIONS b/gnu/usr.bin/awk/LIMITATIONS
new file mode 100644
index 000000000000..5877197aeb55
--- /dev/null
+++ b/gnu/usr.bin/awk/LIMITATIONS

@@ -0,0 +1,14 @@

+This file describes limits of gawk on a Unix system (although they

+vary even then). Non-Unix systems may have other limits.

+# of fields in a record: MAX_INT

+Length of input record: MAX_INT

+Length of output record: unlimited

+Size of a field: MAX_INT

+Size of a printf string: MAX_INT

+Size of a literal string: MAX_INT

+Characters in a character class: 2^(# of bits per byte)

+# of file redirections: unlimited

+# of pipe redirections: min(# of processes per user, # of open files)

+Numeric values: double-precision floating point

+Length of source line: unlimited

diff --git a/gnu/usr.bin/awk/Makefile b/gnu/usr.bin/awk/Makefile
new file mode 100644
index 000000000000..fdca82c4482e
--- /dev/null
+++ b/gnu/usr.bin/awk/Makefile

@@ -0,0 +1,13 @@

+PROG= awk

+SRCS= main.c eval.c builtin.c msg.c iop.c io.c field.c array.c \

+ node.c version.c re.c awk.c regex.c dfa.c \

+ getopt.c getopt1.c

+CFLAGS+= -DGAWK

+LDADD= -lm

+DPADD= ${LIBM}

+CLEANFILES+= awk.c y.tab.h

+MAN1= awk.0

+.include <bsd.prog.mk>

+.include "../../usr.bin/Makefile.inc"

diff --git a/gnu/usr.bin/awk/NEWS b/gnu/usr.bin/awk/NEWS
new file mode 100644
index 000000000000..6711373d6ea5
--- /dev/null
+++ b/gnu/usr.bin/awk/NEWS

@@ -0,0 +1,1295 @@

+Changes from 2.15.1 to 2.15.2

+---------------------------

+Additions to the FUTURES file.

+Document undefined order of output when using both standard output

+ and /dev/stdout or any of the /dev output files that gawk emulates in

+ the absence of OS support.

+Clean up the distribution generation in Makefile.in: the info files are

+ now included, the distributed files are marked read-only and patched

+ distributions are now unpacked in a directory named with the patch level.

+Changes from 2.15 to 2.15.1

+---------------------------

+Close stdout and stderr before all redirections on program exit. This allows

+ detection of write errors and also fixes the messages test on Solaris 2.x.

+Removed YYMAXDEPTH define in awk.y which was limiting the parser stack depth.

+Changes to config/bsd44, Makefile.bsd44 and configure to bring it into line

+ with the BSD4.4 release.

+Changed Makefile to use prefix, exec_prefix, bindir etc.

+make install now installs info files.

+make install now sets permissions on installed files.

+Make targets added: uninstall, distclean, mostlyclean and realclean.

+Added config.h to cleaner and clobber make targets.

+Changes to config/{hpux8x,sysv3,sysv4,ultrix41} to deal with alloca().

+Change to getopt.h for portability.

+Added more special cases to the getpgrp() call.

+Added README.ibmrt-aos and config/ibmrt-aos.

+Changes from 2.14 to 2.15

+---------------------------

+Command-line source can now be mixed with library functions.

+ARGIND variable tracks index in ARGV of FILENAME.

+GNU style long options in addition to short options.

+Plan 9 style special files interpreted by gawk:

+ /dev/pid

+ /dev/ppid

+ /dev/pgrpid

+ /dev/user

+ $1 = getuid

+ $2 = geteuid

+ $3 = getgid

+ $4 = getegid

+ $5 ... $NF = getgroups if supported

+ERRNO variable contains error string if getline or close fails.

+Very old options -a and -e have gone away.

+Inftest has been removed from the default target in test/Makefile -- the

+ results were too machine specific and resulted in too many false alarms.

+A README.amiga has been added.

+The "too many arguments supplied for format string" warning message is only

+ in effect under the lint option.

+Code improvements in dfa.c.

+Fixed all reported bugs:

+ Writes are checked for failure (such as full filesystem).

+ Stopped (at least some) runaway error messages.

+ gsub(/^/, "x") does the right thing for $0 of 0, 1, or more length.

+ close() on a command being piped to a getline now works properly.

+ The input record will no longer be freed upon an explicit close()

+ of the input file.

+ A NUL character in FS now works.

+ In a substitute, \\& now means a literal backslash followed by what

+ was matched.

+ Integer overflow of substring length in substr() is caught.

+ An input record without a newline termination is handled properly.

+ In io.c, check is against only EMFILE so that system file table

+ is not filled.

+ Renamed all files with names longer than 14 characters.

+ Escaped characters in regular expressions were being lost when

+ IGNORECASE was used.

+ Long source lines were not being handled properly.

+ Sourcefiles that ended in a tab but no newline were bombing.

+ Patterns that could match zero characters in split() were not working

+ properly.

+ The parsedebug option was not working.

+ The grammar was being a bit too lenient, allowing some very dubious

+ programs to pass.

+ Compilation with DEBUG defined now works.

+ A variable read in with getline was not being treated as a potential

+ number.

+ Array subscripts were not always of string type.

+Changes from 2.13.2 to 2.14

+---------------------------

+Updated manual!

+Added "next file" to skip efficiently to the next input file.

+Fixed potential of overflowing buffer in do_sprintf().

+Plugged small memory leak in sub_common().

+EOF on a redirect is now "sticky" -- it can only be cleared by close()ing

+ the pipe or file.

+Now works if used via a #! /bin/gawk line at the top of an executable file

+ when that line ends with whitespace.

+Added some checks to the grammar to catch redefinition of builtin functions.

+ This could eventually be the basis for an extension to allow redefining

+ functions, but in the mean time it's a good error catching facility.

+Negative integer exponents now work.

+Modified do_system() to make sure it had a non-null string to be passed

+ to system(3). Thus, system("") will flush any pending output but not go

+ through the overhead of forking an un-needed shell.

+A fix to floating point comparisons so that NaNs compare right on IEEE systems.

+Added code to make sure we're not opening directories for reading and such.

+Added code to do better diagnoses of weird or null file names.

+Allow continue outside of a loop, unless in strict posix mode. Lint option

+ will issue warning.

+New missing/strftime.c. There has been one change that affects gawk. Posix

+ now defines a %V conversion so the vms conversion has been changed to %v.

+ If this version is used with gawk -Wlint and they use %V in a call to

+ strftime, they'll get a warning.

+Error messages now conform to GNU standard (I hope).

+Changed comparisons to conform to the description found in the file POSIX.

+ This is inconsistent with the current POSIX draft, but that is broken.

+ Hopefully the final POSIX standard will conform to this version.

+ (Alas, this will have to wait for 1003.2b, which will be a revision to

+ the 1003.2 standard. That standard has been frozen with the broken

+ comparison rules.)

+The length of a string was a short and now is a size_t.

+Updated VMS help.

+Added quite a few new tests to the test suite and deleted many due to lack of

+ written releases. Test output is only removed if it is identical to the

+ "good" output.

+Fixed a couple of bugs for reference to $0 when $0 is "" -- particularly in

+ a BEGIN block.

+Fixed premature freeing in construct "$0 = $0".

+Removed the call to wait_any() in gawk_popen(), since on at least some systems,

+ if gawk's input was from a pipe, the predecessor process in the pipe was a

+ child of gawk and this caused a deadlock.

+Regexp can (once again) match a newline, if given explicitly.

+nextopen() makes sure file name is null terminated.

+Fixed VMS pipe simulation. Improved VMS I/O performance.

+Catch . used in variable names.

+Fixed bug in getline without redirect from a file -- it was quitting after the

+ first EOF, rather than trying the next file.

+Fixed bug in treatment of backslash at the end of a string -- it was bombing

+ rather than doing something sensible. It is not clear what this should mean,

+ but for now I issue a warning and take it as a literal backslash.

+Moved setting of regexp syntax to before the option parsing in main(), to

+ handle things like -v FS='[.,;]'

+Fixed bug when NF is set by user -- fields_arr must be expanded if necessary

+ and "new" fields must be initialized.

+Fixed several bugs in [g]sub() for no match found or the match is 0-length.

+Fixed bug where in gsub() a pattern anchored at the beginning would still

+ substitute throughout the string.

+make test does not assume the . is in PATH.

+Fixed bug when a field beyond the end of the record was requested after

+ $0 was altered (directly or indirectly).

+Fixed bug for assignment to field beyond end of record -- the assigned value

+ was not found on subsequent reference to that field.

+Fixed bug for FS a regexp and it matches at the end of a record.

+Fixed memory leak for an array local to a function.

+Fixed hanging of pipe redirection to getline

+Fixed coredump on access to $0 inside BEGIN block.

+Fixed treatment of RS = "". It now parses the fields correctly and strips

+ leading whitespace from a record if FS is a space.

+Fixed faking of /dev/stdin.

+Fixed problem with x += x

+Use of scalar as array and vice versa is now detected.

+IGNORECASE now obeyed for FS (even if FS is a single alphabetic character).

+Switch to GPL version 2.

+Renamed awk.tab.c to awktab.c for MSDOS and VMS tar programs.

+Renamed this file (CHANGES) to NEWS.

+Use fmod() instead of modf() and provide FMOD_MISSING #define to undo

+ this change.

+Correct the volatile declarations in eval.c.

+Avoid errant closing of the file descriptors for stdin, stdout and stderr.

+Be more flexible about where semi-colons can occur in programs.

+Check for write errors on all output, not just on close().

+Eliminate the need for missing/{strtol.c,vprintf.c}.

+Use GNU getopt and eliminate missing/getopt.c.

+More "lint" checking.

+Changes from 2.13.1 to 2.13.2

+-----------------------------

+Toward conformity with GNU standards, configure is a link to mkconf, the latter

+ to disappear in the next major release.

+Update to config/bsd43.

+Added config/apollo, config/msc60, config/cray2-50, config/interactive2.2

+sgi33.cc added for compilation using cc rather than gcc.

+Ultrix41 now propagates to config.h properly -- as part of a general

+ mechanism in configure for kludges -- #define anything from a config file

+ just gets tacked onto the end of config.h -- to be used sparingly.

+Got rid of an unnecessary and troublesome declaration of vprintf().

+Small improvement in locality of error messages.

+Try to diagnose use of array as scalar and vice versa -- to be improved in

+ the future.

+Fix for last bug fix for Cray division code--sigh.

+More changes to test suite to explicitly use sh. Also get rid of

+ a few generated files.

+Fixed off-by-one bug in string concatenation code.

+Fix for use of array that is passed in from a previous function parameter.

+ Addition to test suite for above.

+A number of changes associated with changing NF and access to fields

+ beyond the end of the current record.

+Change to missing/memcmp.c to avoid seg. fault on zero length input.

+Updates to test suite (including some inadvertently left out of the last patch)

+ to invoke sh explicitly (rather than rely on #!/bin/sh) and remove some

+ junk files. test/chem/good updated to correspond to bug fixes.

+Changes from 2.13.0 to 2.13.1

+-----------------------------

+More configs and PORTS.

+Fixed bug wherein a simple division produced an erroneous FPE, caused by

+ the Cray division workaround -- that code is now #ifdef'd only for

+ Cray *and* fixed.

+Fixed bug in modulus implementation -- it was very close to the above

+ code, so I noticed it.

+Fixed portability problem with limits.h in missing.c

+Fixed portability problem with tzname and daylight -- define TZNAME_MISSING

+ if strftime() is missing and tzname is also.

+Better support for Latin-1 character set.

+Fixed portability problem in test Makefile.

+Updated PROBLEMS file.

+=============================== gawk-2.13 released =========================

+Changes from 2.12.42 to 2.12.43

+-------------------------------

+Typo in awk.y

+Fixed up strftime.3 and added doc. for %V.

+Changes from 2.12.41 to 2.12.42

+-------------------------------

+Fixed bug in devopen() -- if you had write permission in /dev,

+ it would just create /dev/stdout etc.!!

+Final (?) VMS update.

+Make NeXT use GFMT_WORKAROUND

+Fixed bug in sub_common() for substitute on zero-length match. Improved the

+ code a bit while I was at it.

+Fixed grammar so that $i++ parses as ($i)++

+Put support/* back in the distribution (didn't I already do this?!)

+Changes from 2.12.40 to 2.12.41

+-------------------------------

+VMS workaround for broken %g format.

+Changes from 2.12.39 to 2.12.40

+-------------------------------

+Minor man page update.

+Fixed latent bug in redirect().

+Changes from 2.12.38 to 2.12.39

+-------------------------------

+Updates to test suite -- remove dependence on changing gawk.1 man page.

+Changes from 2.12.37 to 2.12.38

+-------------------------------

+Fixed bug in use of *= without whitespace following.

+VMS update.

+Updates to man page.

+Option handling updates in main.c

+test/manyfiles redone and added to bigtest.

+Fixed latent (on Sun) bug in handling of save_fs.

+Changes from 2.12.36 to 2.12.37

+-------------------------------

+Update REL in Makefile-dist. Incorporate test suite into main distribution.

+Minor fix in regtest.

+Changes from 2.12.35 to 2.12.36

+-------------------------------

+Release takes on dual personality -- 2.12.36 and 2.13.0 -- any further

+ patches before public release won't count for 2.13, although they will for

+ 2.12 -- be careful to avoid confusion! patchlevel.h will be the last thing

+ to change.

+Cray updates to deal with arithmetic problems.

+Minor test suite updates.

+Fixed latent bug in parser (freeing memory).

+Changes from 2.12.34 to 2.12.35

+-------------------------------

+VMS updates.

+Flush stdout at top of err() and stderr at bottom.

+Fixed bug in eval_condition() -- it wasn't testing for MAYBE_NUM and

+ doing the force_number().

+Included the missing manyfiles.awk and a new test to catch the above bug which

+ I am amazed wasn't already caught by the test suite -- it's pretty basic.

+Changes from 2.12.33 to 2.12.34

+-------------------------------

+Atari updates -- including bug fix.

+More VMS updates -- also nuke vms/version.com.

+Fixed bug in handling of large numbers of redirections -- it was probably never

+ tested before (blush!).

+Minor rearrangement of code in r_force_number().

+Made chem and regtest tests a bit more portable (Ultrix again).

+Added another test -- manyfiles -- not invoked under any other test -- very Unix

+ specific.

+Rough beginning of LIMITATIONS file -- need my AWK book to complete it.

+Changes from 2.12.32 to 2.12.33

+-------------------------------

+Expunge debug.? from various files.

+Remove vestiges of Floor and Ceil kludge.

+Special case integer division -- mainly for Cray, but maybe someone else

+ will benefit.

+Workaround for iop_close closing an output pipe descriptor on Cray --

+ not conditional since I think it may fix a bug on SGI as well and I don't

+ think it can hurt elsewhere.

+Fixed memory leak in assoc_lookup().

+Small cleanup in test suite.

+Changes from 2.12.31 to 2.12.32

+-------------------------------

+Nuked debug.c and debugging flag -- there are better ways.

+Nuked version.sh and version.c in subdirectories.

+Fixed bug in handling of IGNORECASE.

+Fixed bug when FIELDWIDTHS was set via -v option.

+Fixed (obscure) bug when $0 is assigned a numerical value.

+Fixed so that escape sequences in command-line assignments work (as it already

+ said in the comment).

+Added a few cases to test suite.

+Moved support/* back into distribution.

+VMS updates.

+Changes from 2.12.30 to 2.12.31

+-------------------------------

+Cosmetic manual page changes.

+Updated sunos3 config.

+Small changes in test suite including renaming files over 14 chars. in length.

+Changes from 2.12.29 to 2.12.30

+-------------------------------

+Bug fix for many string concatenations in a row.

+Changes from 2.12.28 to 2.12.29

+-------------------------------

+Minor cleanup in awk.y

+Minor VMS update.

+Minor atari update.

+Changes from 2.12.27 to 2.12.28

+-------------------------------

+Got rid of the debugging goop in eval.c -- there are better ways.

+Sequent port.

+VMS changes left out of the last patch -- sigh! config/vms.h renamed

+ to config/vms-conf.h.

+Fixed missing/tzset.c

+Removed use of gcvt() and GCVT_MISSING -- turns out it was no faster than

+ sprintf("%g") and caused all sorts of portability headaches.

+Tuned get_field() -- it was unnecessarily parsing the whole record on reference

+ to $0.

+Tuned interpret() a bit in the rule_node loop.

+In r_force_number(), worked around bug in Uglix strtod() and got rid of

+ ugly do{}while(0) at Michal's urging.

+Replaced do_deref() and deref with unref(node) -- much cleaner and a bit faster.

+Got rid of assign_number() -- contrary to comment, it was no faster than

+ just making a new node and freeing the old one.

+Replaced make_number() and tmp_number() with macros that call mk_number().

+Changed freenode() and newnode() into macros -- the latter is getnode()

+ which calls more_nodes() as necessary.

+Changes from 2.12.26 to 2.12.27

+-------------------------------

+Completion of Cray 2 port (includes a kludge for floor() and ceil()

+ that may go or be changed -- I think that it may just be working around

+ a bug in chem that is being tweaked on the Cray).

+More VMS updates.

+Moved kludge over yacc's insertion of malloc and realloc declarations

+ from protos.h to the Makefile.

+Added a lisp interpreter in awk to the test suite. (Invoked under

+ bigtest.)

+Cleanup in r_force_number() -- I had never gotten around to a thorough

+ profile of the cache code and it turns out to be not worth it.

+Performance boost -- do lazy force_number()'ing for fields etc. i.e.

+ flag them (MAYBE_NUM) and call force_number only as necessary.

+Changes from 2.12.25 to 2.12.26

+-------------------------------

+Rework of regexp stuff so that dynamic regexps have reasonable

+ performance -- string used for compiled regexp is stored and

+ compared to new string -- if same, no recompilation is necessary.

+ Also, very dynamic regexps cause dfa-based searching to be turned

+ off.

+Code in dev_open() is back to returning fileno(std*) rather than

+ dup()ing it. This will be documented. Sorry for the run-around

+ on this.

+Minor atari updates.

+Minor vms update.

+Missing file from MSDOS port.

+Added warning (under lint) if third arg. of [g]sub is a constant and

+ handle it properly in the code (i.e. return how many matches).

+Changes from 2.12.24 to 2.12.25

+-------------------------------

+MSDOS port.

+Non-consequential changes to regexp variables in preparation for

+ a more serious change to fix a serious performance problem.

+Changes from 2.12.23 to 2.12.24

+-------------------------------

+Fixed bug in output flushing introduced a few patches back. This caused

+ serious performance losses.

+Changes from 2.12.22 to 2.12.23

+-------------------------------

+Accidentally left config/cray2-60 out of last patch.

+Added some missing dependencies to Makefile.

+Cleaned up mkconf a bit; made yacc the default parser (no alloca needed,

+ right?); added rs6000 hook for signed characters.

+Made regex.c with NO_ALLOCA undefined work.

+Fixed bug in dfa.c for systems where free(NULL) bombs.

+Deleted a few cant_happen()'s that *really* can't happen.

+Changes from 2.12.21 to 2.12.22

+-------------------------------

+Added to config stuff the ability to choose YACC rather than bison.

+Fixed CHAR_UNSIGNED in config.h-dist.

+Second arg. of strtod() is char ** rather than const char **.

+stackb is now initially malloc()'ed since it may be realloc()'ed.

+VMS updates.

+Added SIZE_T_MISSING to config stuff and a default typedef to awk.h.

+ (Maybe it is not needed on any current systems??)

+re_compile_pattern()'s size is now size_t unconditionally.

+Changes from 2.12.20 to 2.12.21

+-------------------------------

+Corrected missing/gcvt.c.

+Got rid of use of dup2() and thus DUP_MISSING.

+Updated config/sgi33.

+Turned on (and fixed) in cmp_nodes() the behaviour that I *hope* will be in

+ POSIX 1003.2 for relational comparisons.

+Small updates to test suite.

+Changes from 2.12.19 to 2.12.20

+-------------------------------

+Sloppy, sloppy, sloppy!! I didn't even try to compile the last two

+ patches. This one fixes goofs in regex.c.

+Changes from 2.12.18 to 2.12.19

+-------------------------------

+Cleanup of last patch.

+Changes from 2.12.17 to 2.12.18

+-------------------------------

+Makefile renamed to Makefile-dist.

+Added alloca() configuration to mkconf. (A bit kludgey.) Just

+ add a single line containing ALLOCA_PW, ALLOCA_S or ALLOCA_C

+ to the appropriate config file to have Makefile-dist edited

+ accordingly.

+Reorganized output flushing to correspond with new semantics of

+ devopen() on "/dev/std*" etc.

+Fixed rest of last goof!!

+Save and restore errno in do_pathopen().

+Miscellaneous atari updates.

+Get rid of the trailing comma in the NODETYPE definition (Cray

+ compiler won't take it).

+Try to make the use of `const' consistent since Cray compiler is

+ fussy about that. See the changes to `basename' and `myname'.

+It turns out that, according to section 3.8.3 (Macro Replacement)

+ of the ANSI Standard: ``If there are sequences of preprocessing

+ tokens within the list of arguments that would otherwise act as

+ preprocessing directives, the behavior is undefined.'' That means

+ that you cannot count on the behavior of the declaration of

+ re_compile_pattern in awk.h, and indeed the Cray compiler chokes on it.

+Replaced alloca with malloc/realloc/free in regex.c. It was much simpler

+ than expected. (Inside NO_ALLOCA for now -- by default no alloca.)

+Added a configuration file, config/cray60, for Unicos-6.0.

+Changes from 2.12.16 to 2.12.17

+-------------------------------

+Ooops. Goofed signal use in last patch.

+Changes from 2.12.15 to 2.12.16

+-------------------------------

+RENAMED *_dir to just * (e.g. missing_dir).

+Numerous VMS changes.

+Proper inclusion of atari and vms files.

+Added experimental (ifdef'd out) RELAXED_CONTINUATION and DEFAULT_FILETYPE

+ -- please comment on these!

+Moved pathopen() to io.c (sigh).

+Put local directory ahead in default AWKPATH.

+Added facility in mkconf to echo comments on stdout: lines beginning

+ with "#echo " will have the remainder of the line echoed when mkconf is run.

+ Any lines starting with "#" will otherwise be treated as comments. The

+ intent is to be able to say:

+ "#echo Make sure you uncomment alloca.c in the Makefile"

+ or the like.

+Prototype fix for V.4

+Fixed version_string to not print leading @(#).

+Fixed FIELDWIDTHS to work with strict (turned out to be easy).

+Fixed conf for V.2.

+Changed semantics of /dev/fd/n to be like on real /dev/fd.

+Several configuration and updates in the makefile.

+Updated manpage.

+Include tzset.c and system.c from missing_dir that were accidentally left out of

+ the last patch.

+Fixed bug in cmdline variable assignment -- arg was getting freed(!) in

+ call to variable.

+Backed out of parse-time constant folding for now, until I can figure out

+ how to do it right.

+Fixed devopen() so that getline <"-" works.

+Changes from 2.12.14 to 2.12.15

+-------------------------------

+Changed config/* to a condensed form that can be used with mkconf to generate

+ a config.h from config.h-dist -- much easier to maintain. Please check

+ carefully against what you had before for a particular system and report

+ any problems. vms.h remains separate since the stuff at the bottom

+ didn't quite fit the mkconf model -- hopefully cleared up later.

+Fixed bug in grammar -- didn't allow function definition to be separated from

+ other rules by a semi-colon.

+VMS fix to #includes in missing.c -- should we just be including awk.h?

+Updated README for texinfo.tex version.

+Updating of copyright in all .[chy] files.

+Added but commented out Michal's fix to strftime.

+Added tzset() emulation based on Rick Adams' code. Added TZSET_MISSING to

+ config.h-dist.

+Added strftime.3 man page for missing_dir

+More posix: func, **, **= don't work in -W posix

+More lint: ^, ^= not in old awk

+gawk.1: removed ref to -DNO_DEV_FD, other minor updating.

+Style change: pushbak becomes pushback() in yylex().

+Changes from 2.12.13 to 2.12.14

+-------------------------------

+Better (?) organization of awk.h -- attempt to keep all system dependencies

+ near the top and move some of the non-general things out of the config.h

+ files.

+Change to handling of SYSTEM_MISSING.

+Small change to ultrix config.

+Do "/dev/fd/*" etc. checking at runtime.

+First pass at VMS port.

+Improvements to error handling (when lexeme spans buffers).

+Fixed backslash handling -- why didn't I notice this sooner?

+Added programs from book to test suite and new target "bigtest" to Makefile.

+Changes from 2.12.12 to 2.12.13

+-------------------------------

+Recognize OFS and ORS specially so that OFS = 9 works without efficiency hit.

+ Took advantage of opportunity to tune do_print*() for about 10% win on a

+ print with 5 args (i.e. small but significant).

+Somewhat pervasive changes to reconcile CONVFMT vs. OFMT.

+Better initialization of builtin vars.

+Make config/* consistent wrt STRTOL_MISSING.

+Small portability improvement to alloca.s

+Improvements to lint code in awk.y

+Replaced strtol() with a better one by Chris Torek.

+Changes from 2.12.11 to 2.12.12

+-------------------------------

+Added PORTS file to record successful ports.

+Added #define const to nothing if not STDC and added const to strtod() header.

+Added * to printf capabilities and partially implemented ' ' and '+' (has an

+ effect for %d only, silently ignored for other formats). I'm afraid that's

+ as far as I want to go before I look at a complete replacement for

+ do_sprintf().

+Added warning for /regexp/ on LHS of MATCHOP.

+Changes from 2.12.10 to 2.12.11

+-------------------------------

+Small Makefile improvements.

+Some remaining nits from the NeXT port.

+Got rid of bcopy() define in awk.h -- not needed anymore (??)

+Changed private in builtin.c -- it is special on Sequent.

+Added subset implementation of strtol() and STRTOL_MISSING.

+A little bit of cleanup in debug.c, dfa.c.

+Changes from 2.12.9 to 2.12.10

+------------------------------

+Redid compatibility checking and checking for # of args.

+Removed all references to variables[] from outside awk.y, in preparation

+ for a more abstract interface to the symbol table.

+Got rid of a remaining use of bcopy() in regex.c.

+Changes from 2.12.8 to 2.12.9

+-----------------------------

+Portability improvements for atari, next and decstation.

+Bug fix in substr() -- wasn't handling 3rd arg. of -1 properly.

+Manpage updates.

+Moved support from src release to doc release.

+Updated FUTURES file.

+Added some "lint" warnings.

+Changes from 2.12.7 to 2.12.8

+-----------------------------

+Changed time() to systime().

+Changed warning() in snode() to fatal().

+strftime() now defaults second arg. to current time.

+Changes from 2.12.6 to 2.12.7

+-----------------------------

+Fixed bug in sub_common() involving inadequate allocation of a buffer.

+Added some missing files to the Makefile.

+Changes from 2.12.5 to 2.12.6

+-----------------------------

+Fixed bug wherein non-redirected getline could call iop_close() just

+ prior to a call from do_input().

+Fixed bug in handling of /dev/stdout and /dev/stderr.

+Changes from 2.12.4 to 2.12.5

+-----------------------------

+Updated README and support directory.

+Changes from 2.12.3 to 2.12.4

+-----------------------------

+Updated CHANGES and TODO (should have been done in previous 2 patches).

+Changes from 2.12.2 to 2.12.3

+-----------------------------

+Brought regex.c and alloca.s into line with current FSF versions.

+Changes from 2.12.1 to 2.12.2

+-----------------------------

+Portability improvements; mostly moving system prototypes out of awk.h

+Introduction of strftime.

+Use of CONVFMT.

+Changes from 2.12 to 2.12.1

+-----------------------------

+Consolidated treatment of command-line assignments (thus correcting the

+-v treatment).

+Rationalized builtin-variable handling into a table-driven process, thus

+simplifying variable() and eliminating spc_var().

+Fixed bug in handling of command-line source that ended in a newline.

+Simplified install() and lookup().

+Did away with double-mallocing of identifiers and now free second and later

+instances of a name, after the first gets installed into the symbol table.

+Treat IGNORECASE specially, simplifying a lot of code, and allowing

+checking against strict conformance only on setting it, rather than on each

+pattern match.

+Fixed regexp matching when IGNORECASE is non-zero (broken when dfa.c was

+added).

+Fixed bug where $0 was not being marked as valid, even after it was rebuilt.

+This caused mangling of $0.

+Changes from 2.11.1 to 2.12

+-----------------------------

+Makefile:

+Portability improvements in Makefile.

+Move configuration stuff into config.h

+FSF files:

+Synchronized alloca.[cs] and regex.[ch] with FSF.

+array.c:

+Rationalized hash routines into one with a different algorithm.

+delete() now works if the array is a local variable.

+Changed interface of assoc_next() and avoided dereferencing past the end of the

+ array.

+awk.h:

+Merged non-prototype and prototype declarations in awk.h.

+Expanded tree_eval #define to short-circuit more calls of r_tree_eval().

+awk.y:

+Delinted some of the code in the grammar.

+Fixed and improved some of the error message printing.

+Changed to accommodate unlimited length source lines.

+Line continuation now works as advertised.

+Source lines can be arbitrarily long.

+Refined grammar hacks so that /= assignment works. Regular expressions

+ starting with /= are recognized at the beginning of a line, after && or ||

+ and after ~ or !~. More contexts can be added if necessary.

+Fixed IGNORECASE (multiple scans for backslash).

+Condensed expression_lists in array references.

+Detect and warn for correct # args in builtin functions -- call most of them

+ with a fixed number (i.e. fill in defaults at parse-time rather than at

+ run-time).

+Load ENVIRON only if it is referenced (detected at parse-time).

+Treat NF, FS, RS, NR, FNR specially at parse time, to improve run time.

+Fold constant expressions at parse time.

+Do make_regexp() on third arg. of split() at parse time if it is a constant.

+builtin.c:

+srand() returns 0 the first time called.

+Replaced alloca() with malloc() in do_sprintf().

+Fixed setting of RSTART and RLENGTH in do_match().

+Got rid of get_{one,two,three} and allowance for variable # of args. at

+ run-time -- this is now done at parse-time.

+Fixed latent bug in [g]sub whereby changes to $0 would never get made.

+Rewrote much of sub_common() for simplicity and performance.

+Added ctime() and time() builtin functions (unless -DSTRICT). ctime() returns

+ a time string like the C function, given the number of seconds since the epoch

+ and time() returns the current time in seconds.

+do_sprintf() now checks for mismatch between format string and number of

+ arguments supplied.

+dfa.c

+This is borrowed (almost unmodified) from GNU grep to provide faster searches.

+eval.c

+Node_var, Node_var_array and Node_param_list handled from macro rather

+ than in r_tree_eval().

+Changed cmp_nodes() to not do a force_number() -- this, combined with a

+ force_number() on ARGV[] and ENVIRON[] brings it into line with other awks

+Greatly simplified cmp_nodes().

+Separated out Node_NF, Node_FS, Node_RS, Node_NR and Node_FNR in get_lhs().

+All adjacent string concatenations now done at once.

+field.c

+Added support for FIELDWIDTHS.

+Fixed bug in get_field() whereby changes to a field were not always

+ properly reflected in $0.

+Reordered tests in parse_field() so that reference off the end of the buffer

+ doesn't happen.

+set_FS() now sets *parse_field i.e. routine to call depending on type of FS.

+It also does make_regexp() for FS if needed. get_field() passes FS_regexp

+ to re_parse_field(), as does do_split().

+Changes to set_field() and set_record() to avoid malloc'ing and free'ing the

+ field nodes repeatedly. The fields now just point into $0 unless they are

+ assigned to another variable or changed. force_number() on the field is

+ *only* done when the field is needed.

+gawk.1

+Fixed troff formatting problem on .TP lines.

+io.c

+Moved some code out into iop.c.

+Output from pipes and system() calls is properly synchronized.

+Status from pipe close properly returned.

+Bug in getline with no redirect fixed.

+iop.c

+This file contains a totally revamped get_a_record and associated code.

+main.c

+Command line programs no longer use a temporary file.

+Therefore, tmpnam() no longer required.

+Deprecated -a and -e options -- they will go away in the next release,

+ but for now they cause a warning.

+Moved -C, -V, -c options to -W ala posix.

+Added -W posix option: throw out \x

+Added -W lint option.

+node.c

+force_number() now allows pure numerics to have leading whitespace.

+Added make_string facility to optimize case of adding an already malloc'd

+ string.

+Cleaned up and simplified do_deref().

+Fixed bug in handling of stref==255 in do_deref().

+re.c

+contains the interface to regexp code

+Changes from 2.11.1 to FSF version of same

+------------------------------------------

+Thu Jan 4 14:19:30 1990 Jim Kingdon (kingdon at albert)

+ * Makefile (YACC): Add -y to bison part.

+ * missing.c: Add #include <stdio.h>.

+Sun Dec 24 16:16:05 1989 David J. MacKenzie (djm at hobbes.ai.mit.edu)

+ * Makefile: Add (commented out) default defines for Sony News.

+ * awk.h: Move declaration of vprintf so it will compile when

+ -DVPRINTF_MISSING is defined.

+Mon Nov 13 18:54:08 1989 Robert J. Chassell (bob at apple-gunkies.ai.mit.edu)

+ * gawk.texinfo: changed @-commands that are not part of the

+ standard, currently released texinfmt.el to those that are.

+ Otherwise, only people with the as-yet unreleased makeinfo.c can

+ format this file.

+Changes from 2.11beta to 2.11.1 (production)

+--------------------------------------------

+Went from "beta" to production status!!!

+Now flushes stdout before closing pipes or redirected files to

+synchronize output.

+MS-DOS changes added in.

+Signal handler return type parameterized in Makefile and awk.h and

+some lint removed. debug.c cleaned up.

+Fixed FS splitting to never match null strings, per book.

+Correction to the manual's description of FS.

+Some compilers break on char *foo = "string" + 4 so fixed version.sh and

+main.c.

+Changes from 2.10beta to 2.11beta

+---------------------------------

+This release fixes all reported bugs that we could reproduce. Probably

+some of the changes are not documented here.

+The next release will probably not be a beta release!

+The most important change is the addition of the -nostalgia option. :-)

+The documentation has been improved and brought up-to-date.

+There has been a lot of general cleaning up of the code that is not otherwise

+documented here. There has been a movement toward using standard-conforming

+library routines and providing them (in missing.d) for systems lacking them.

+Improved (hopefully) configuration through Makefile modifications and missing.c.

+In particular, straightened out confusion over vprintf #defines, declarations

+etc.

+Deleted RCS log comments from source, to reduce source size by about one third.

+Most of them were horribly out-of-date, anyway.

+Renamed source files to reflect (for the most part) their contents.

+More and improved error messages. Cleanup and fixes to yyerror().

+String constants are not altered in input buffer, so error messages come out

+better. Fixed usage message. Make use of ANSI C strerror() function

+(provided).

+Plugged many more memory leaks. The memory consumption is now quite

+reasonable over a wide range of programs.

+Uses volatile declaration if STDC > 0 to avoid problems due to longjmp.

+New -a and -e options to use awk or egrep style regexps, respectively,

+since POSIX says awk should use egrep regexps. Default is -a.

+Added -v option for setting variables before the first file is encountered.

+Version information now uses -V and copyleft uses -C.

+Added a patchlevel.h file and its use for -V and -C.

+Append_right() optimized for major improvement to programs with a *lot*

+of statements.

+Operator precedence has been corrected to match draft Posix.

+Tightened up grammar for builtin functions so that only length

+may be called without arguments or parentheses.

+/regex/ is now a normal expression that can appear in any expression

+context.

+Allow /= to begin a regexp. Allow ..[../..].. in a regexp.

+Allow empty compound statements ({}).

+Made return and next illegal outside a function and in BEGIN/END respectively.

+Division by zero is now illegal and causes a fatal error.

+Fixed exponentiation so that x ^ 0 and x ^= 0 both return 1.

+Fixed do_sqrt, do_log, and do_exp to do argument/return checking and

+print an error message, per the manual.

+Fixed main to catch SIGSEGV to get source and data file line numbers.

+Fixed yyerror to print the ^ at the beginning of the bad token, not the end.

+Fix to substr() builtin: it was failing if the arguments

+weren't already strings.

+Added new node value flag NUMERIC to indicate that a variable is

+purely a number as opposed to type NUM which indicates that

+the node's numeric value is valid. This is set in make_number(),

+tmp_number and r_force_number() when appropriate and used in

+cmp_nodes(). This fixed a bug in comparison of variables that had

+numeric prefixes. The new code uses strtod() and eliminates is_a_number().

+A simple strtod() is provided for systems lacking one. It does no

+overflow checking, so could be improved.

+Simplification and efficiency improvement in force_string.

+Added performance tweak in r_force_number().

+Fixed a bug with nested loops and break/continue in functions.

+Fixed inconsistency in handling of empty fields when $0 has to be rebuilt.

+Happens to simplify rebuild_record().

+Cleaned up the code associated with opening a pipe for reading. Gawk

+now has its own popen routine (gawk_popen) that allocates an IOBUF

+and keeps track of the pid of the child process. gawk_pclose

+marks the appropriate child as defunct in the right struct redirect.

+Cleaned up and fixed close_redir().

+Fixed an obscure bug to do with redirection. Intermingled ">" and ">>"

+redirects did not output in a predictable order.

+Improved handling of output buffering: now all print[f]s redirected to a tty

+or pipe are flushed immediately and non-redirected output to a tty is flushed

+before the next input record is read.

+Fixed a bug in get_a_record() where bcopy() could have copied over

+a random pointer.

+Fixed a bug when RS="" and records separated by multiple blank lines.

+Got rid of SLOWIO code which was out-of-date anyway.

+Fix in get_field() for case where $0 is changed and then $(n) are

+changed and then $0 is used.

+Fixed infinite loop on failure to open file for reading from getline.

+Now handles redirect file open failures properly.

+Filenames such as /dev/stdin now allowed on the command line as well as

+in redirects.

+Fixed so that gawk '$1' where $1 is a zero tests false.

+Fixed parsing so that `RLENGTH -1' parses the same as `RLENGTH - 1',

+for example.

+The return from a user-defined function now defaults to the Null node.

+This fixes a core-dump-causing bug when the return value of a function

+is used and that function returns no value.

+Now catches floating point exceptions to avoid core dumps.

+Bug fix for deleting elements of an array -- under some conditions, it was

+deleting more than one element at a time.

+Fix in AWKPATH code for running off the end of the string.

+Fixed handling of precision in *printf calls. %0.2d now works properly,

+as does %c. [s]printf now recognizes %i and %X.

+Fixed a bug in printing of very large (>240) strings.

+Cleaned up erroneous behaviour for RS == "".

+Added IGNORECASE support to index().

+Simplified and fixed newnode/freenode.

+Fixed reference to $(anything) in a BEGIN block.

+Eliminated use of USG rand48().

+Bug fix in force_string for machines with 16-bit ints.

+Replaced use of mktemp() with tmpnam() and provided a partial implementation of

+the latter for systems that don't have it.

+Added a portability check for includes in io.c.

+Minor portability fix in alloc.c plus addition of xmalloc().

+Portability fix: on UMAX4.2, st_blksize is zero for a pipe, thus breaking

+iop_alloc() -- fixed.

+Workaround for compiler bug on Sun386i in do_sprintf.

+More and improved prototypes in awk.h.

+Consolidated C escape parsing code into one place.

+strict flag is now turned on only when invoked with compatibility option.

+It now applies to fewer things.

+Changed cast of f._ptr in vprintf.c from (unsigned char *) to (char *).

+Hopefully this is right for the systems that use this code (I don't).

+Support for pipes under MSDOS added.

diff --git a/gnu/usr.bin/awk/PORTS b/gnu/usr.bin/awk/PORTS
new file mode 100644
index 000000000000..95e133f9dd03
--- /dev/null
+++ b/gnu/usr.bin/awk/PORTS

@@ -0,0 +1,32 @@

+A recent version of gawk has been successfully compiled and run "make test"

+on the following:

+Sun 4/490 running 4.1

+NeXT running 2.0

+DECstation 3100 running Ultrix 4.0 or Ultrix 3.1 (different config)

+AtariST (16-bit ints, gcc compiler, byacc, running under TOS)

+ESIX V.3.2 Rev D (== System V Release 3.2), the 386. compiler was gcc + bison

+IBM RS/6000 (see README.rs6000)

+486 running SVR4, using cc and bison

+SGI running IRIX 3.3 using gcc (fails with cc)

+Sequent Balance running Dynix V3.1

+Cray Y-MP8 running Unicos 6.0.11

+Cray 2 running Unicos 6.1 (modulo trailing zeroes in chem)

+VAX/VMS V5.x (should also work on 4.6 and 4.7)

+VMS POSIX V1.0, V1.1

+OpenVMS AXP V1.0

+MSDOS - Microsoft C 5.1, compiles and runs very simple testing

+BSD 4.4alpha

+From: ghazi@caip.rutgers.edu (Kaveh R. Ghazi):

+arch configured as:

+---- --------------

+Hpux 9.0 hpux8x

+NeXTStep 2.0 next20

+Sgi Irix 4.0.5 (/bin/cc) sgi405.cc (new file)

+Stardent Titan 1500 OSv2.5 sysv3

+Stardent Vistra (i860) SVR4 sysv4

+SunOS 4.1.2 sunos41

+Tektronix XD88 (UTekV 3.2e) sysv3

+Ultrix 4.2 ultrix41

diff --git a/gnu/usr.bin/awk/POSIX b/gnu/usr.bin/awk/POSIX
new file mode 100644
index 000000000000..f2405420aedf
--- /dev/null
+++ b/gnu/usr.bin/awk/POSIX

@@ -0,0 +1,95 @@

+Right now, the numeric vs. string comparisons are screwed up in draft

+11.2. What prompted me to check it out was the note in gnu.bug.utils

+which observed that gawk was doing the comparison $1 == "000"

+numerically. I think that we can agree that intuitively, this should

+be done as a string comparison. Version 2.13.2 of gawk follows the

+current POSIX draft. Following is how I (now) think this

+stuff should be done.

+1. A numeric literal or the result of a numeric operation has the NUMERIC

+ attribute.

+2. A string literal or the result of a string operation has the STRING

+ attribute.

+3. Fields, getline input, FILENAME, ARGV elements, ENVIRON elements and the

+ elements of an array created by split() that are numeric strings

+ have the STRNUM attribute. Otherwise, they have the STRING attribute.

+ Uninitialized variables also have the STRNUM attribute.

+4. Attributes propagate across assignments, but are not changed by

+ any use. (Although a use may cause the entity to acquire an additional

+ value such that it has both a numeric and string value -- this leaves the

+ attribute unchanged.)

+When two operands are compared, either string comparison or numeric comparison

+may be used, depending on the attributes of the operands, according to the

+following (symmetric) matrix:

+ +----------------------------------------------

+ | STRING NUMERIC STRNUM

+--------+----------------------------------------------

+ |

+STRING | string string string

+ |

+NUMERIC | string numeric numeric

+ |

+STRNUM | string numeric numeric

+--------+----------------------------------------------

+So, the following program should print all OKs.

+echo '0e2 0a 0 0b

+0e2 0a 0 0b' |

+$AWK '

+NR == 1 {

+ num = 0

+ str = "0e2"

+ print ++test ": " ( (str == "0e2") ? "OK" : "OOPS" )

+ print ++test ": " ( ("0e2" != 0) ? "OK" : "OOPS" )

+ print ++test ": " ( ("0" != $2) ? "OK" : "OOPS" )

+ print ++test ": " ( ("0e2" == $1) ? "OK" : "OOPS" )

+ print ++test ": " ( (0 == "0") ? "OK" : "OOPS" )

+ print ++test ": " ( (0 == num) ? "OK" : "OOPS" )

+ print ++test ": " ( (0 != $2) ? "OK" : "OOPS" )

+ print ++test ": " ( (0 == $1) ? "OK" : "OOPS" )

+ print ++test ": " ( ($1 != "0") ? "OK" : "OOPS" )

+ print ++test ": " ( ($1 == num) ? "OK" : "OOPS" )

+ print ++test ": " ( ($2 != 0) ? "OK" : "OOPS" )

+ print ++test ": " ( ($2 != $1) ? "OK" : "OOPS" )

+ print ++test ": " ( ($3 == 0) ? "OK" : "OOPS" )

+ print ++test ": " ( ($3 == $1) ? "OK" : "OOPS" )

+ print ++test ": " ( ($2 != $4) ? "OK" : "OOPS" ) # 15

+ a = "+2"

+ b = 2

+ if (NR % 2)

+ c = a + b

+ print ++test ": " ( (a != b) ? "OK" : "OOPS" ) # 16 and 22

+ d = "2a"

+ b = 2

+ if (NR % 2)

+ c = d + b

+ print ++test ": " ( (d != b) ? "OK" : "OOPS" )

+ print ++test ": " ( (d + 0 == b) ? "OK" : "OOPS" )

+ e = "2"

+ print ++test ": " ( (e == b "") ? "OK" : "OOPS" )

+ a = "2.13"

+ print ++test ": " ( (a == 2.13) ? "OK" : "OOPS" )

+ a = "2.130000"

+ print ++test ": " ( (a != 2.13) ? "OK" : "OOPS" )

+ if (NR == 2) {

+ CONVFMT = "%.6f"

+ print ++test ": " ( (a == 2.13) ? "OK" : "OOPS" )

+ }

+}'

diff --git a/gnu/usr.bin/awk/PROBLEMS b/gnu/usr.bin/awk/PROBLEMS
new file mode 100644
index 000000000000..3b7c5148bd8e
--- /dev/null
+++ b/gnu/usr.bin/awk/PROBLEMS

@@ -0,0 +1,6 @@

+This is a list of known problems in gawk 2.15.

+Hopefully they will all be fixed in the next major release of gawk.

+Please keep in mind that the code is still undergoing significant evolution.

+1. Gawk's printf is probably still not POSIX compliant.

diff --git a/gnu/usr.bin/awk/README b/gnu/usr.bin/awk/README
new file mode 100644
index 000000000000..f4bd3df806c8
--- /dev/null
+++ b/gnu/usr.bin/awk/README

@@ -0,0 +1,116 @@

+README:

+This is GNU Awk 2.15. It should be upwardly compatible with the

+System V Release 4 awk. It is almost completely compliant with draft 11.3

+of POSIX 1003.2.

+This release adds new features -- see NEWS for details.

+See the installation instructions, below.

+Known problems are given in the PROBLEMS file. Work to be done is

+described briefly in the FUTURES file. Verified ports are listed in

+the PORTS file. Changes in this version are summarized in the CHANGES file.

+Please read the LIMITATIONS and ACKNOWLEDGMENT files.

+Read the file POSIX for a discussion of how the standard says comparisons

+should be done vs. how they really should be done and how gawk does them.

+To format the documentation with TeX, you must use texinfo.tex 2.53

+or later. Otherwise footnotes look unacceptable.

+If you wish to remake the Info files, you should use makeinfo. The 2.15

+version of makeinfo works with no errors.

+The man page is up to date.

+INSTALLATION:

+Check whether there is a system-specific README file for your system.

+Makefile.in may need some tailoring. The only changes necessary should

+be to change installation targets or to change compiler flags.

+The changes to make in Makefile.in are commented and should be obvious.

+All other changes should be made in a config file. Samples for

+various systems are included in the config directory. Starting with

+2.11, our intent has been to make the code conform to standards (ANSI,

+POSIX, SVID, in that order) whenever possible, and to not penalize

+standard conforming systems. We have included substitute versions of

+routines not universally available. Simply add the appropriate define

+for the missing feature(s) on your system.

+If you have neither bison nor yacc, use the awktab.c file here. It was

+generated with bison, and should have no AT&T code in it. (Note that

+modifying awk.y without bison or yacc will be difficult, at best. You might

+want to get a copy of bison from the FSF too.)

+If no config file is included for your system, start by copying one

+for a similar system. One way of determining the defines needed is to

+try to load gawk with nothing defined and see what routines are

+unresolved by the loader. This should give you a good idea of how to

+proceed.

+The next release will use the FSF autoconfig program, so we are no longer

+soliciting new config files.

+If you have an MS-DOS system, use the stuff in the pc directory.

+For an Atari there is an atari directory and similarly one for VMS.

+Chapter 16 of The GAWK Manual discusses configuration in detail.

+After successful compilation, do 'make test' to run a small test

+suite. There should be no output from the 'cmp' invocations except in

+the cases where there are small differences in floating point values.

+If there are other differences, please investigate and report the

+problem.

+PRINTING THE MANUAL

+The 'support' directory contains texinfo.tex 2.65, which will be necessary

+for printing the manual, and the texindex.c program from the texinfo

+distribution which is also necessary. See the makefile for the steps needed

+to get a DVI file from the manual.

+CAVEATS

+The existence of a patchlevel.h file does *N*O*T* imply a commitment on

+our part to issue bug fixes or patches. It is there in case we should

+decide to do so.

+BUG REPORTS AND FIXES (Un*x systems):

+Please coordinate changes through David Trueman and/or Arnold Robbins.

+David Trueman

+Department of Mathematics, Statistics and Computing Science,

+Dalhousie University, Halifax, Nova Scotia, Canada

+UUCP: {uunet utai watmath}!dalcs!david

+INTERNET: david@cs.dal.ca

+Arnold Robbins

+1736 Reindeer Drive

+Atlanta, GA, 30329, USA

+INTERNET: arnold@skeeve.atl.ga.us

+UUCP: { gatech, emory, emoryu1 }!skeeve!arnold

+BUG REPORTS AND FIXES (non-Unix ports):

+MS-DOS:

+ Scott Deifik

+ AMGEN Inc.

+ Amgen Center, Bldg.17-Dept.393

+ Thousand Oaks, CA 91320-1789

+ Tel-805-499-5725 ext.4677

+ Fax-805-498-0358

+ scottd@amgen.com

+VMS:

+ Pat Rankin

+ rankin@eql.caltech.edu (e-mail only)

+Atari ST:

+ Michal Jaegermann

+ NTOMCZAK@vm.ucs.UAlberta.CA (e-mail only)

diff --git a/gnu/usr.bin/awk/array.c b/gnu/usr.bin/awk/array.c
new file mode 100644
index 000000000000..59be340c04df
--- /dev/null
+++ b/gnu/usr.bin/awk/array.c

@@ -0,0 +1,293 @@

+/*

+ * array.c - routines for associative arrays.

+ */

+/*

+ *

+ * This file is part of GAWK, the GNU implementation of the

+ * AWK Programming Language.

+ *

+ * GAWK is free software; you can redistribute it and/or modify

+ * it under the terms of the GNU General Public License as published by

+ * the Free Software Foundation; either version 2 of the License, or

+ * (at your option) any later version.

+ *

+ * GAWK is distributed in the hope that it will be useful,

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

+ * GNU General Public License for more details.

+ *

+ * You should have received a copy of the GNU General Public License

+ * along with GAWK; see the file COPYING. If not, write to

+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

+ */

+#include "awk.h"

+static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1));

+NODE *

+concat_exp(tree) /* evaluate a subscript expression (or list) to a single string node; list elements are joined with the current SUBSEP string */

+register NODE *tree;

+ register NODE *r;

+ char *str;

+ char *s;

+ unsigned len;

+ int offset;

+ int subseplen;

+ char *subsep;

+ if (tree->type != Node_expression_list) /* not a list: just its value as a string */

+ return force_string(tree_eval(tree));

+ r = force_string(tree_eval(tree->lnode));

+ if (tree->rnode == NULL) /* one-element list: nothing to join */

+ return r;

+ subseplen = SUBSEP_node->lnode->stlen;

+ subsep = SUBSEP_node->lnode->stptr;

+ len = r->stlen + subseplen + 2; /* first piece, one separator, plus slack for the extra byte each memcpy below copies */

+ emalloc(str, char *, len, "concat_exp");

+ memcpy(str, r->stptr, r->stlen+1); /* stlen+1 also copies the byte after the text (presumably its NUL terminator) */

+ s = str + r->stlen; /* s always points at the end of the text built so far */

+ free_temp(r);

+ tree = tree->rnode;

+ while (tree) {

+ if (subseplen == 1) /* common case: single-character separator */

+ *s++ = *subsep;

+ else {

+ memcpy(s, subsep, subseplen+1);

+ s += subseplen;

+ }

+ r = force_string(tree_eval(tree->lnode));

+ len += r->stlen + subseplen; /* room for this piece and the separator that may follow it */

+ offset = s - str; /* str may change in erealloc, so keep the position as an offset */

+ erealloc(str, char *, len, "concat_exp");

+ s = str + offset;

+ memcpy(s, r->stptr, r->stlen+1);

+ s += r->stlen;

+ free_temp(r);

+ tree = tree->rnode;

+ }

+ r = make_str_node(str, s - str, ALREADY_MALLOCED); /* presumably the node adopts str rather than copying it -- confirm in node.c */

+ r->flags |= TEMP; /* result is a temporary; callers in this file release it with free_temp() */

+ return r;

+/* Flush all the values in symbol[] before doing a split(): every bucket's name and value are unref'd and the bucket node freed; the table of chains itself is kept */

+void

+assoc_clear(symbol)

+NODE *symbol;

+ int i;

+ NODE *bucket, *next;

+ if (symbol->var_array == 0) /* no table allocated: nothing to clear */

+ return;

+ for (i = 0; i < HASHSIZE; i++) {

+ for (bucket = symbol->var_array[i]; bucket; bucket = next) {

+ next = bucket->ahnext; /* save the link before the bucket is freed */

+ unref(bucket->ahname);

+ unref(bucket->ahvalue);

+ freenode(bucket);

+ }

+ symbol->var_array[i] = 0; /* chain i is now empty */

+ }

+/*

+ * calculate the hash function of the len bytes starting at s; the value returned is always less than HASHSIZE

+ */

+unsigned int

+hash(s, len)

+register char *s;

+register int len;

+ register unsigned long h = 0, g;

+ while (len--) {

+ h = (h << 4) + *s++; /* shift in the next byte (plain char, so its sign depends on the platform) */

+ g = (h & 0xf0000000); /* bits 28-31 of the running value */

+ if (g) {

+ h = h ^ (g >> 24); /* fold those bits back into the low part ... */

+ h = h ^ g; /* ... and clear them */

+ }

+ if (h < HASHSIZE) /* already in range: skip the modulo */

+ return h;

+ else

+ return h%HASHSIZE;

+/*

+ * locate symbol[subs] by walking chain hash1 of symbol's table; a match that is not already first is moved to the front of its chain

+ */

+static NODE * /* NULL if not found */

+assoc_find(symbol, subs, hash1)

+NODE *symbol;

+register NODE *subs;

+int hash1;

+ register NODE *bucket, *prev = 0;

+ for (bucket = symbol->var_array[hash1]; bucket; bucket = bucket->ahnext) {

+ if (cmp_nodes(bucket->ahname, subs) == 0) { /* stored name compares equal to subs */

+ if (prev) { /* move found to front of chain */

+ prev->ahnext = bucket->ahnext;

+ bucket->ahnext = symbol->var_array[hash1];

+ symbol->var_array[hash1] = bucket;

+ }

+ return bucket;

+ } else

+ prev = bucket; /* save previous list entry */

+ }

+ return NULL;

+/*

+ * test whether the array element symbol[subs] exists or not; returns 1 if it does, 0 otherwise, and never creates the element

+ */

+int

+in_array(symbol, subs)

+NODE *symbol, *subs;

+ register int hash1;

+ if (symbol->type == Node_param_list) /* parameter: use the node stack_ptr holds for it */

+ symbol = stack_ptr[symbol->param_cnt];

+ if (symbol->var_array == 0) /* no table yet, so no elements */

+ return 0;

+ subs = concat_exp(subs); /* concat_exp returns a string node */

+ hash1 = hash(subs->stptr, subs->stlen);

+ if (assoc_find(symbol, subs, hash1) == NULL) {

+ free_temp(subs); /* release the subscript string on both paths */

+ return 0;

+ } else {

+ free_temp(subs);

+ return 1;

+ }

+/*

+ * SYMBOL is the address of the node (or other pointer) being dereferenced.

+ * SUBS is a number or string used as the subscript.

+ * The table of HASHSIZE chains is allocated on the first lookup.

+ * Find SYMBOL[SUBS] in the assoc array. Install it with value "" if it

+ * isn't there. Returns a pointer ala get_lhs to where its value is stored

+ */

+NODE **

+assoc_lookup(symbol, subs)

+NODE *symbol, *subs;

+ register int hash1;

+ register NODE *bucket;

+ (void) force_string(subs); /* the subscript is hashed and compared via its string form */

+ hash1 = hash(subs->stptr, subs->stlen);

+ if (symbol->var_array == 0) { /* this table really should grow

+ * dynamically */

+ unsigned size;

+ size = sizeof(NODE *) * HASHSIZE;

+ emalloc(symbol->var_array, NODE **, size, "assoc_lookup");

+ memset((char *)symbol->var_array, 0, size); /* every chain starts empty */

+ symbol->type = Node_var_array; /* symbol is an array from now on */

+ } else {

+ bucket = assoc_find(symbol, subs, hash1);

+ if (bucket != NULL) {

+ free_temp(subs);

+ return &(bucket->ahvalue);

+ }

+ /* It's not there, install it. */

+ if (do_lint && subs->stlen == 0)

+ warning("subscript of array `%s' is null string",

+ symbol->vname);

+ getnode(bucket);

+ bucket->type = Node_ahash;

+ if (subs->flags & TEMP)

+ bucket->ahname = dupnode(subs);

+ else {

+ unsigned int saveflags = subs->flags;

+ subs->flags &= ~MALLOC; /* NOTE(review): MALLOC is masked only around dupnode(), presumably to force a private copy of the name -- confirm in node.c */

+ bucket->ahname = dupnode(subs);

+ subs->flags = saveflags;

+ }

+ free_temp(subs);

+ /* array subscripts are strings */

+ bucket->ahname->flags &= ~NUMBER;

+ bucket->ahname->flags |= STRING;

+ bucket->ahvalue = Nnull_string; /* a new element starts out as the null string */

+ bucket->ahnext = symbol->var_array[hash1]; /* push the new bucket on the front of its chain */

+ symbol->var_array[hash1] = bucket;

+ return &(bucket->ahvalue);

+void

+do_delete(symbol, tree) /* remove the element symbol[tree] if it exists; tree is the (possibly multi-part) subscript expression */

+NODE *symbol, *tree;

+ register int hash1;

+ register NODE *bucket, *last;

+ NODE *subs;

+ if (symbol->type == Node_param_list) /* parameter: use the node stack_ptr holds for it */

+ symbol = stack_ptr[symbol->param_cnt];

+ if (symbol->var_array == 0) /* no table, so no elements to delete */

+ return;

+ subs = concat_exp(tree); /* concat_exp returns string node */

+ hash1 = hash(subs->stptr, subs->stlen);

+ last = NULL;

+ for (bucket = symbol->var_array[hash1]; bucket; last = bucket, bucket = bucket->ahnext) /* last trails one bucket behind */

+ if (cmp_nodes(bucket->ahname, subs) == 0)

+ break;

+ free_temp(subs);

+ if (bucket == NULL) /* no such element */

+ return;

+ if (last) /* unlink from the middle or end of the chain */

+ last->ahnext = bucket->ahnext;

+ else /* it was the first bucket of the chain */

+ symbol->var_array[hash1] = bucket->ahnext;

+ unref(bucket->ahname);

+ unref(bucket->ahvalue);

+ freenode(bucket);

+void

+assoc_scan(symbol, lookat) /* begin a scan over symbol's elements; the first subscript node, or NULL if there is none, is left in lookat->retval */

+NODE *symbol;

+struct search *lookat;

+ if (!symbol->var_array) { /* no table: report end of scan at once */

+ lookat->retval = NULL;

+ return;

+ }

+ lookat->arr_ptr = symbol->var_array;

+ lookat->arr_end = lookat->arr_ptr + HASHSIZE; /* added: one past the last chain */

+ lookat->bucket = symbol->var_array[0];

+ assoc_next(lookat); /* position on the first element */

+void

+assoc_next(lookat) /* advance the scan: lookat->retval gets the next subscript node, or NULL once every chain is exhausted */

+struct search *lookat;

+ while (lookat->arr_ptr < lookat->arr_end) {

+ if (lookat->bucket != 0) {

+ lookat->retval = lookat->bucket->ahname;

+ lookat->bucket = lookat->bucket->ahnext; /* step past it, ready for the next call */

+ return;

+ }

+ lookat->arr_ptr++; /* current chain exhausted: move to the next one */

+ if (lookat->arr_ptr < lookat->arr_end)

+ lookat->bucket = *(lookat->arr_ptr);

+ else

+ lookat->retval = NULL;

+ }

+ return;

diff --git a/gnu/usr.bin/awk/awk.1 b/gnu/usr.bin/awk/awk.1
new file mode 100644
index 000000000000..0338485e8db8
--- /dev/null
+++ b/gnu/usr.bin/awk/awk.1

@@ -0,0 +1,1873 @@

+.ds PX \s-1POSIX\s+1

+.ds UX \s-1UNIX\s+1

+.ds AN \s-1ANSI\s+1

+.TH GAWK 1 "Apr 15 1993" "Free Software Foundation" "Utility Commands"

+.SH NAME

+gawk \- pattern scanning and processing language

+.SH SYNOPSIS

+.B gawk

+[ POSIX or GNU style options ]

+.B \-f

+.I program-file

+.B \-\^\-

+] file .\^.\^.

+.br

+.B gawk

+[ POSIX or GNU style options ]

+.B \-\^\-

+.I program-text

+file .\^.\^.

+.SH DESCRIPTION

+.I Gawk

+is the GNU Project's implementation of the AWK programming language.

+It conforms to the definition of the language in

+the \*(PX 1003.2 Command Language And Utilities Standard.

+This version in turn is based on the description in

+.IR "The AWK Programming Language" ,

+by Aho, Kernighan, and Weinberger,

+with the additional features defined in the System V Release 4 version

+of \*(UX

+.IR awk .

+.I Gawk

+also provides some GNU-specific extensions.

+.PP

+The command line consists of options to

+.I gawk

+itself, the AWK program text (if not supplied via the

+.B \-f

+or

+.B \-\^\-file

+options), and values to be made

+available in the

+.B ARGC

+and

+.B ARGV

+pre-defined AWK variables.

+.SH OPTIONS

+.PP

+.I Gawk

+options may be either the traditional \*(PX one letter options,

+or the GNU style long options. \*(PX style options start with a single ``\-'',

+while GNU long options start with ``\-\^\-''.

+GNU style long options are provided for both GNU-specific features and

+for \*(PX mandated features. Other implementations of the AWK language

+are likely to only accept the traditional one letter options.

+.PP

+Following the \*(PX standard,

+.IR gawk -specific

+options are supplied via arguments to the

+.B \-W

+option. Multiple

+.B \-W

+options may be supplied, or multiple arguments may be supplied together

+if they are separated by commas, or enclosed in quotes and separated

+by white space.

+Case is ignored in arguments to the

+.B \-W

+option.

+Each

+.B \-W

+option has a corresponding GNU style long option, as detailed below.

+.PP

+.I Gawk

+accepts the following options.

+.TP

+.PD 0

+.BI \-F " fs"

+.TP

+.PD

+.BI \-\^\-field-separator= fs

+Use

+.I fs

+for the input field separator (the value of the

+.B FS

+predefined

+variable).

+.TP

+.PD 0

+\fB\-v\fI var\fB\^=\^\fIval\fR

+.TP

+.PD

+\fB\-\^\-assign=\fIvar\fB\^=\^\fIval\fR

+Assign the value

+.IR val ,

+to the variable

+.IR var ,

+before execution of the program begins.

+Such variable values are available to the

+.B BEGIN

+block of an AWK program.

+.TP

+.PD 0

+.BI \-f " program-file"

+.TP

+.PD

+.BI \-\^\-file= program-file

+Read the AWK program source from the file

+.IR program-file ,

+instead of from the first command line argument.

+Multiple

+.B \-f

+(or

+.BR \-\^\-file )

+options may be used.

+.TP \w'\fB\-\^\-copyright\fR'u+1n

+.PD 0

+.B "\-W compat"

+.TP

+.PD

+.B \-\^\-compat

+Run in

+.I compatibility

+mode. In compatibility mode,

+.I gawk

+behaves identically to \*(UX

+.IR awk ;

+none of the GNU-specific extensions are recognized.

+See

+.BR "GNU EXTENSIONS" ,

+below, for more information.

+.TP

+.PD 0

+.B "\-W copyleft"

+.TP

+.PD 0

+.B "\-W copyright"

+.TP

+.PD 0

+.B \-\^\-copyleft

+.TP

+.PD

+.B \-\^\-copyright

+Print the short version of the GNU copyright information message on

+the error output.

+.TP

+.PD 0

+.B "\-W help"

+.TP

+.PD 0

+.B "\-W usage"

+.TP

+.PD 0

+.B \-\^\-help

+.TP

+.PD

+.B \-\^\-usage

+Print a relatively short summary of the available options on

+the error output.

+.TP

+.PD 0

+.B "\-W lint"

+.TP

+.PD

+.B \-\^\-lint

+Provide warnings about constructs that are

+dubious or non-portable to other AWK implementations.

+.ig

+.\" This option is left undocumented, on purpose.

+.TP

+.PD 0

+.B "\-W nostalgia"

+.TP

+.PD

+.B \-\^\-nostalgia

+Provide a moment of nostalgia for long time

+.I awk

+users.

+..

+.TP

+.PD 0

+.B "\-W posix"

+.TP

+.PD

+.B \-\^\-posix

+This turns on

+.I compatibility

+mode, with the following additional restrictions:

+.RS

+.TP \w'\(bu'u+1n

+\(bu

+.B \ex

+escape sequences are not recognized.

+.TP

+\(bu

+The synonym

+.B func

+for the keyword

+.B function

+is not recognized.

+.TP

+\(bu

+The operators

+.B **

+and

+.B **=

+cannot be used in place of

+.B ^

+and

+.BR ^= .

+.RE

+.TP

+.PD 0

+.BI "\-W source=" program-text

+.TP

+.PD

+.BI \-\^\-source= program-text

+Use

+.I program-text

+as AWK program source code.

+This option allows the easy intermixing of library functions (used via the

+.B \-f

+and

+.B \-\^\-file

+options) with source code entered on the command line.

+It is intended primarily for medium to large size AWK programs used

+in shell scripts.

+.sp .5

+The

+.B "\-W source="

+form of this option uses the rest of the command line argument for

+.IR program-text ;

+no other options to

+.B \-W

+will be recognized in the same argument.

+.TP

+.PD 0

+.B "\-W version"

+.TP

+.PD

+.B \-\^\-version

+Print version information for this particular copy of

+.I gawk

+on the error output.

+This is useful mainly for knowing if the current copy of

+.I gawk

+on your system

+is up to date with respect to whatever the Free Software Foundation

+is distributing.

+.TP

+.B \-\^\-

+Signal the end of options. This is useful to allow further arguments to the

+AWK program itself to start with a ``\-''.

+This is mainly for consistency with the argument parsing convention used

+by most other \*(PX programs.

+.PP

+Any other options are flagged as illegal, but are otherwise ignored.

+.SH AWK PROGRAM EXECUTION

+.PP

+An AWK program consists of a sequence of pattern-action statements

+and optional function definitions.

+.RS

+.PP

+\fIpattern\fB { \fIaction statements\fB }\fR

+.br

+\fBfunction \fIname\fB(\fIparameter list\fB) { \fIstatements\fB }\fR

+.RE

+.PP

+.I Gawk

+first reads the program source from the

+.IR program-file (s)

+if specified, or from the first non-option argument on the command line.

+The

+.B \-f

+option may be used multiple times on the command line.

+.I Gawk

+will read the program text as if all the

+.IR program-file s

+had been concatenated together. This is useful for building libraries

+of AWK functions, without having to include them in each new AWK

+program that uses them. To use a library function in a file from a

+program typed in on the command line, specify

+.B /dev/tty

+as one of the

+.IR program-file s,

+type your program, and end it with a

+.B ^D

+(control-d).

+.PP

+The environment variable

+.B AWKPATH

+specifies a search path to use when finding source files named with

+the

+.B \-f

+option. If this variable does not exist, the default path is

+\fB".:/usr/lib/awk:/usr/local/lib/awk"\fR.

+If a file name given to the

+.B \-f

+option contains a ``/'' character, no path search is performed.

+.PP

+.I Gawk

+executes AWK programs in the following order.

+First,

+.I gawk

+compiles the program into an internal form.

+Next, all variable assignments specified via the

+.B \-v

+option are performed. Then,

+.I gawk

+executes the code in the

+.B BEGIN

+block(s) (if any),

+and then proceeds to read

+each file named in the

+.B ARGV

+array.

+If there are no files named on the command line,

+.I gawk

+reads the standard input.

+.PP

+If a filename on the command line has the form

+.IB var = val

+it is treated as a variable assignment. The variable

+.I var

+will be assigned the value

+.IR val .

+(This happens after any

+.B BEGIN

+block(s) have been run.)

+Command line variable assignment

+is most useful for dynamically assigning values to the variables

+AWK uses to control how input is broken into fields and records. It

+is also useful for controlling state if multiple passes are needed over

+a single data file.

+.PP

+If the value of a particular element of

+.B ARGV

+is empty (\fB""\fR),

+.I gawk

+skips over it.

+.PP

+For each line in the input,

+.I gawk

+tests to see if it matches any

+.I pattern

+in the AWK program.

+For each pattern that the line matches, the associated

+.I action

+is executed.

+The patterns are tested in the order they occur in the program.

+.PP

+Finally, after all the input is exhausted,

+.I gawk

+executes the code in the

+.B END

+block(s) (if any).

+.SH VARIABLES AND FIELDS

+AWK variables are dynamic; they come into existence when they are

+first used. Their values are either floating-point numbers or strings,

+or both,

+depending upon how they are used. AWK also has one dimensional

+arrays; multiply dimensioned arrays may be simulated.

+Several pre-defined variables are set as a program

+runs; these will be described as needed and summarized below.

+.SS Fields

+.PP

+As each input line is read,

+.I gawk

+splits the line into

+.IR fields ,

+using the value of the

+.B FS

+variable as the field separator.

+If

+.B FS

+is a single character, fields are separated by that character.

+Otherwise,

+.B FS

+is expected to be a full regular expression.

+In the special case that

+.B FS

+is a single blank, fields are separated

+by runs of blanks and/or tabs.

+Note that the value of

+.B IGNORECASE

+(see below) will also affect how fields are split when

+.B FS

+is a regular expression.

+.PP

+If the

+.B FIELDWIDTHS

+variable is set to a space separated list of numbers, each field is

+expected to have fixed width, and

+.I gawk

+will split up the record using the specified widths. The value of

+.B FS

+is ignored.

+Assigning a new value to

+.B FS

+overrides the use of

+.BR FIELDWIDTHS ,

+and restores the default behavior.

+.PP

+Each field in the input line may be referenced by its position,

+.BR $1 ,

+.BR $2 ,

+and so on.

+.B $0

+is the whole line. The value of a field may be assigned to as well.

+Fields need not be referenced by constants:

+.RS

+.PP

+.ft B

+n = 5

+.br

+print $n

+.ft R

+.RE

+.PP

+prints the fifth field in the input line.

+The variable

+.B NF

+is set to the total number of fields in the input line.

+.PP

+References to non-existent fields (i.e. fields after

+.BR $NF )

+produce the null-string. However, assigning to a non-existent field

+(e.g.,

+.BR "$(NF+2) = 5" )

+will increase the value of

+.BR NF ,

+create any intervening fields with the null string as their value, and

+cause the value of

+.B $0

+to be recomputed, with the fields being separated by the value of

+.BR OFS .

+.SS Built-in Variables

+.PP

+AWK's built-in variables are:

+.PP

+.TP \w'\fBFIELDWIDTHS\fR'u+1n

+.B ARGC

+The number of command line arguments (does not include options to

+.IR gawk ,

+or the program source).

+.TP

+.B ARGIND

+The index in

+.B ARGV

+of the current file being processed.

+.TP

+.B ARGV

+Array of command line arguments. The array is indexed from

+0 to

+.B ARGC

+\- 1.

+Dynamically changing the contents of

+.B ARGV

+can control the files used for data.

+.TP

+.B CONVFMT

+The conversion format for numbers, \fB"%.6g"\fR, by default.

+.TP

+.B ENVIRON

+An array containing the values of the current environment.

+The array is indexed by the environment variables, each element being

+the value of that variable (e.g., \fBENVIRON["HOME"]\fP might be

+.BR /u/arnold ).

+Changing this array does not affect the environment seen by programs which

+.I gawk

+spawns via redirection or the

+.B system()

+function.

+(This may change in a future version of

+.IR gawk .)

+.\" but don't hold your breath...

+.TP

+.B ERRNO

+If a system error occurs either doing a redirection for

+.BR getline ,

+during a read for

+.BR getline ,

+or during a

+.BR close ,

+then

+.B ERRNO

+will contain

+a string describing the error.

+.TP

+.B FIELDWIDTHS

+A white-space separated list of field widths. When set,

+.I gawk

+parses the input into fields of fixed width, instead of using the

+value of the

+.B FS

+variable as the field separator.

+The fixed field width facility is still experimental; expect the

+semantics to change as

+.I gawk

+evolves over time.

+.TP

+.B FILENAME

+The name of the current input file.

+If no files are specified on the command line, the value of

+.B FILENAME

+is ``\-''.

+.TP

+.B FNR

+The input record number in the current input file.

+.TP

+.B FS

+The input field separator, a blank by default.

+.TP

+.B IGNORECASE

+Controls the case-sensitivity of all regular expression operations. If

+.B IGNORECASE

+has a non-zero value, then pattern matching in rules,

+field splitting with

+.BR FS ,

+regular expression

+matching with

+.B ~

+and

+.BR !~ ,

+and the

+.BR gsub() ,

+.BR index() ,

+.BR match() ,

+.BR split() ,

+and

+.B sub()

+pre-defined functions will all ignore case when doing regular expression

+operations. Thus, if

+.B IGNORECASE

+is not equal to zero,

+.B /aB/

+matches all of the strings \fB"ab"\fP, \fB"aB"\fP, \fB"Ab"\fP,

+and \fB"AB"\fP.

+As with all AWK variables, the initial value of

+.B IGNORECASE

+is zero, so all regular expression operations are normally case-sensitive.

+.TP

+.B NF

+The number of fields in the current input record.

+.TP

+.B NR

+The total number of input records seen so far.

+.TP

+.B OFMT

+The output format for numbers, \fB"%.6g"\fR, by default.

+.TP

+.B OFS

+The output field separator, a blank by default.

+.TP

+.B ORS

+The output record separator, by default a newline.

+.TP

+.B RS

+The input record separator, by default a newline.

+.B RS

+is exceptional in that only the first character of its string

+value is used for separating records.

+(This will probably change in a future release of

+.IR gawk .)

+If

+.B RS

+is set to the null string, then records are separated by

+blank lines.

+When

+.B RS

+is set to the null string, then the newline character always acts as

+a field separator, in addition to whatever value

+.B FS

+may have.

+.TP

+.B RSTART

+The index of the first character matched by

+.BR match() ;

+0 if no match.

+.TP

+.B RLENGTH

+The length of the string matched by

+.BR match() ;

+\-1 if no match.

+.TP

+.B SUBSEP

+The character used to separate multiple subscripts in array

+elements, by default \fB"\e034"\fR.

+.SS Arrays

+.PP

+Arrays are subscripted with an expression between square brackets

+.RB ( [ " and " ] ).

+If the expression is an expression list

+.RI ( expr ", " expr " ...)"

+then the array subscript is a string consisting of the

+concatenation of the (string) value of each expression,

+separated by the value of the

+.B SUBSEP

+variable.

+This facility is used to simulate multiply dimensioned

+arrays. For example:

+.PP

+.RS

+.ft B

+i = "A" ;\^ j = "B" ;\^ k = "C"

+.br

+x[i, j, k] = "hello, world\en"

+.ft R

+.RE

+.PP

+assigns the string \fB"hello, world\en"\fR to the element of the array

+.B x

+which is indexed by the string \fB"A\e034B\e034C"\fR. All arrays in AWK

+are associative, i.e. indexed by string values.

+.PP

+The special operator

+.B in

+may be used in an

+.B if

+or

+.B while

+statement to see if an array has an index consisting of a particular

+value.

+.PP

+.RS

+.ft B

+.nf

+if (val in array)

+ print array[val]

+.fi

+.ft

+.RE

+.PP

+If the array has multiple subscripts, use

+.BR "(i, j) in array" .

+.PP

+The

+.B in

+construct may also be used in a

+.B for

+loop to iterate over all the elements of an array.

+.PP

+An element may be deleted from an array using the

+.B delete

+statement.

+.SS Variable Typing And Conversion

+.PP

+Variables and fields

+may be (floating point) numbers, or strings, or both. How the

+value of a variable is interpreted depends upon its context. If used in

+a numeric expression, it will be treated as a number; if used as a string

+it will be treated as a string.

+.PP

+To force a variable to be treated as a number, add 0 to it; to force it

+to be treated as a string, concatenate it with the null string.

+.PP

+When a string must be converted to a number, the conversion is accomplished

+using

+.IR atof (3).

+A number is converted to a string by using the value of

+.B CONVFMT

+as a format string for

+.IR sprintf (3),

+with the numeric value of the variable as the argument.

+However, even though all numbers in AWK are floating-point,

+integral values are

+.I always

+converted as integers. Thus, given

+.PP

+.RS

+.ft B

+.nf

+CONVFMT = "%2.2f"

+a = 12

+b = a ""

+.fi

+.ft R

+.RE

+.PP

+the variable

+.B b

+has a value of \fB"12"\fR and not \fB"12.00"\fR.

+.PP

+.I Gawk

+performs comparisons as follows:

+If two variables are numeric, they are compared numerically.

+If one value is numeric and the other has a string value that is a

+``numeric string,'' then comparisons are also done numerically.

+Otherwise, the numeric value is converted to a string and a string

+comparison is performed.

+Two strings are compared, of course, as strings.

+According to the \*(PX standard, even if two strings are

+numeric strings, a numeric comparison is performed. However, this is

+clearly incorrect, and

+.I gawk

+does not do this.

+.PP

+Uninitialized variables have the numeric value 0 and the string value ""

+(the null, or empty, string).

+.SH PATTERNS AND ACTIONS

+AWK is a line oriented language. The pattern comes first, and then the

+action. Action statements are enclosed in

+.B {

+and

+.BR } .

+Either the pattern may be missing, or the action may be missing, but,

+of course, not both. If the pattern is missing, the action will be

+executed for every single line of input.

+A missing action is equivalent to

+.RS

+.PP

+.B "{ print }"

+.RE

+.PP

+which prints the entire line.

+.PP

+Comments begin with the ``#'' character, and continue until the

+end of the line.

+Blank lines may be used to separate statements.

+Normally, a statement ends with a newline; however, this is not the

+case for lines ending in

+a ``,'', ``{'', ``?'', ``:'', ``&&'', or ``||''.

+Lines ending in

+.B do

+or

+.B else

+also have their statements automatically continued on the following line.

+In other cases, a line can be continued by ending it with a ``\e'',

+in which case the newline will be ignored.

+.PP

+Multiple statements may

+be put on one line by separating them with a ``;''.

+This applies to both the statements within the action part of a

+pattern-action pair (the usual case),

+and to the pattern-action statements themselves.

+.SS Patterns

+AWK patterns may be one of the following:

+.PP

+.RS

+.nf

+.B BEGIN

+.B END

+.BI / "regular expression" /

+.I "relational expression"

+.IB pattern " && " pattern

+.IB pattern " || " pattern

+.IB pattern " ? " pattern " : " pattern

+.BI ( pattern )

+.BI ! " pattern"

+.IB pattern1 ", " pattern2

+.fi

+.RE

+.PP

+.B BEGIN

+and

+.B END

+are two special kinds of patterns which are not tested against

+the input.

+The action parts of all

+.B BEGIN

+patterns are merged as if all the statements had

+been written in a single

+.B BEGIN

+block. They are executed before any

+of the input is read. Similarly, all the

+.B END

+blocks are merged,

+and executed when all the input is exhausted (or when an

+.B exit

+statement is executed).

+.B BEGIN

+and

+.B END

+patterns cannot be combined with other patterns in pattern expressions.

+.B BEGIN

+and

+.B END

+patterns cannot have missing action parts.

+.PP

+For

+.BI / "regular expression" /

+patterns, the associated statement is executed for each input line that matches

+the regular expression.

+Regular expressions are the same as those in

+.IR egrep (1),

+and are summarized below.

+.PP

+.I "relational expression"

+may use any of the operators defined below in the section on actions.

+These generally test whether certain fields match certain regular expressions.

+.PP

+The

+.BR && ,

+.BR || ,

+and

+.B !

+operators are logical AND, logical OR, and logical NOT, respectively, as in C.

+They do short-circuit evaluation, also as in C, and are used for combining

+more primitive pattern expressions. As in most languages, parentheses

+may be used to change the order of evaluation.

+.PP

+The

+.B ?\^:

+operator is like the same operator in C. If the first pattern is true

+then the pattern used for testing is the second pattern, otherwise it is

+the third. Only one of the second and third patterns is evaluated.

+.PP

+The

+.IB pattern1 ", " pattern2

+form of an expression is called a range pattern.

+It matches all input records starting with a line that matches

+.IR pattern1 ,

+and continuing until a record that matches

+.IR pattern2 ,

+inclusive. It does not combine with any other sort of pattern expression.

+.SS Regular Expressions

+Regular expressions are the extended kind found in

+.IR egrep .

+They are composed of characters as follows:

+.TP \w'\fB[^\fIabc...\fB]\fR'u+2n

+.I c

+matches the non-metacharacter

+.IR c .

+.TP

+.I \ec

+matches the literal character

+.IR c .

+.TP

+.B .

+matches any character except newline.

+.TP

+.B ^

+matches the beginning of a line or a string.

+.TP

+.B $

+matches the end of a line or a string.

+.TP

+.BI [ abc... ]

+character class, matches any of the characters

+.IR abc... .

+.TP

+.BI [^ abc... ]

+negated character class, matches any character except

+.I abc...

+and newline.

+.TP

+.IB r1 | r2

+alternation: matches either

+.I r1

+or

+.IR r2 .

+.TP

+.I r1r2

+concatenation: matches

+.IR r1 ,

+and then

+.IR r2 .

+.TP

+.IB r +

+matches one or more

+.IR r 's.

+.TP

+.IB r *

+matches zero or more

+.IR r 's.

+.TP

+.IB r ?

+matches zero or one

+.IR r 's.

+.TP

+.BI ( r )

+grouping: matches

+.IR r .

+.PP

+The escape sequences that are valid in string constants (see below)

+are also legal in regular expressions.

+.SS Actions

+Action statements are enclosed in braces,

+.B {

+and

+.BR } .

+Action statements consist of the usual assignment, conditional, and looping

+statements found in most languages. The operators, control statements,

+and input/output statements

+available are patterned after those in C.

+.SS Operators

+.PP

+The operators in AWK, in order of increasing precedence, are

+.PP

+.TP "\w'\fB*= /= %= ^=\fR'u+1n"

+.PD 0

+.B "= += \-="

+.TP

+.PD

+.B "*= /= %= ^="

+Assignment. Both absolute assignment

+.BI ( var " = " value )

+and operator-assignment (the other forms) are supported.

+.TP

+.B ?:

+The C conditional expression. This has the form

+.IB expr1 " ? " expr2 " : " expr3\c

+\&. If

+.I expr1

+is true, the value of the expression is

+.IR expr2 ,

+otherwise it is

+.IR expr3 .

+Only one of

+.I expr2

+and

+.I expr3

+is evaluated.

+.TP

+.B ||

+Logical OR.

+.TP

+.B &&

+Logical AND.

+.TP

+.B "~ !~"

+Regular expression match, negated match.

+.B NOTE:

+Do not use a constant regular expression

+.RB ( /foo/ )

+on the left-hand side of a

+.B ~

+or

+.BR !~ .

+Only use one on the right-hand side. The expression

+.BI "/foo/ ~ " exp

+has the same meaning as \fB(($0 ~ /foo/) ~ \fIexp\fB)\fR.

+This is usually

+.I not

+what was intended.

+.TP

+.PD 0

+.B "< >"

+.TP

+.PD 0

+.B "<= >="

+.TP

+.PD

+.B "!= =="

+The regular relational operators.

+.TP

+.I blank

+String concatenation.

+.TP

+.B "+ \-"

+Addition and subtraction.

+.TP

+.B "* / %"

+Multiplication, division, and modulus.

+.TP

+.B "+ \- !"

+Unary plus, unary minus, and logical negation.

+.TP

+.B ^

+Exponentiation (\fB**\fR may also be used, and \fB**=\fR for

+the assignment operator).

+.TP

+.B "++ \-\^\-"

+Increment and decrement, both prefix and postfix.

+.TP

+.B $

+Field reference.

+.SS Control Statements

+.PP

+The control statements are

+as follows:

+.PP

+.RS

+.nf

+\fBif (\fIcondition\fB) \fIstatement\fR [ \fBelse\fI statement \fR]

+\fBwhile (\fIcondition\fB) \fIstatement \fR

+\fBdo \fIstatement \fBwhile (\fIcondition\fB)\fR

+\fBfor (\fIexpr1\fB; \fIexpr2\fB; \fIexpr3\fB) \fIstatement\fR

+\fBfor (\fIvar \fBin\fI array\fB) \fIstatement\fR

+\fBbreak\fR

+\fBcontinue\fR

+\fBdelete \fIarray\^\fB[\^\fIindex\^\fB]\fR

+\fBexit\fR [ \fIexpression\fR ]

+\fB{ \fIstatements \fB}\fR

+.fi

+.RE

+.SS "I/O Statements"

+.PP

+The input/output statements are as follows:

+.PP

+.TP "\w'\fBprintf \fIfmt, expr-list\fR'u+1n"

+.BI close( filename )

+Close file (or pipe, see below).

+.TP

+.B getline

+Set

+.B $0

+from next input record; set

+.BR NF ,

+.BR NR ,

+.BR FNR .

+.TP

+.BI "getline <" file

+Set

+.B $0

+from next record of

+.IR file ;

+set

+.BR NF .

+.TP

+.BI getline " var"

+Set

+.I var

+from next input record; set

+.BR NR ,

+.BR FNR .

+.TP

+.BI getline " var" " <" file

+Set

+.I var

+from next record of

+.IR file .

+.TP

+.B next

+Stop processing the current input record. The next input record

+is read and processing starts over with the first pattern in the

+AWK program. If the end of the input data is reached, the

+.B END

+block(s), if any, are executed.

+.TP

+.B "next file"

+Stop processing the current input file. The next input record read

+comes from the next input file.

+.B FILENAME

+is updated,

+.B FNR

+is reset to 1, and processing starts over with the first pattern in the

+AWK program. If the end of the input data is reached, the

+.B END

+block(s), if any, are executed.

+.TP

+.B print

+Prints the current record.

+.TP

+.BI print " expr-list"

+Prints expressions.

+.TP

+.BI print " expr-list" " >" file

+Prints expressions on

+.IR file .

+.TP

+.BI printf " fmt, expr-list"

+Format and print.

+.TP

+.BI printf " fmt, expr-list" " >" file

+Format and print on

+.IR file .

+.TP

+.BI system( cmd-line )

+Execute the command

+.IR cmd-line ,

+and return the exit status.

+(This may not be available on non-\*(PX systems.)

+.PP

+Other input/output redirections are also allowed. For

+.B print

+and

+.BR printf ,

+.BI >> file

+appends output to the

+.IR file ,

+while

+.BI | " command"

+writes on a pipe.

+In a similar fashion,

+.IB command " | getline"

+pipes into

+.BR getline .

+.BR Getline

+will return 0 on end of file, and \-1 on an error.

+.SS The \fIprintf\fP\^ Statement

+.PP

+The AWK versions of the

+.B printf

+statement and

+.B sprintf()

+function

+(see below)

+accept the following conversion specification formats:

+.TP

+.B %c

+An \s-1ASCII\s+1 character.

+If the argument used for

+.B %c

+is numeric, it is treated as a character and printed.

+Otherwise, the argument is assumed to be a string, and only the first

+character of that string is printed.

+.TP

+.B %d

+A decimal number (the integer part).

+.TP

+.B %i

+Just like

+.BR %d .

+.TP

+.B %e

+A floating point number of the form

+.BR [\-]d.ddddddE[+\^\-]dd .

+.TP

+.B %f

+A floating point number of the form

+.BR [\-]ddd.dddddd .

+.TP

+.B %g

+Use

+.B e

+or

+.B f

+conversion, whichever is shorter, with nonsignificant zeros suppressed.

+.TP

+.B %o

+An unsigned octal number (again, an integer).

+.TP

+.B %s

+A character string.

+.TP

+.B %x

+An unsigned hexadecimal number (an integer).

+.TP

+.B %X

+Like

+.BR %x ,

+but using

+.B ABCDEF

+instead of

+.BR abcdef .

+.TP

+.B %%

+A single

+.B %

+character; no argument is converted.

+.PP

+There are optional, additional parameters that may lie between the

+.B %

+and the control letter:

+.TP

+.B \-

+The expression should be left-justified within its field.

+.TP

+.I width

+The field should be padded to this width. If the number has a leading

+zero, then the field will be padded with zeros.

+Otherwise it is padded with blanks.

+.TP

+.BI . prec

+A number indicating the maximum width of strings or digits to the right

+of the decimal point.

+.PP

+The dynamic

+.I width

+and

+.I prec

+capabilities of the \*(AN C

+.B printf()

+routines are supported.

+.B *

+in place of either the

+.I width

+or

+.I prec

+specifications will cause their values to be taken from

+the argument list to

+.B printf

+or

+.BR sprintf() .

+.SS Special File Names

+.PP

+When doing I/O redirection from either

+.B print

+or

+.B printf

+into a file,

+or via

+.B getline

+from a file,

+.I gawk

+recognizes certain special filenames internally. These filenames

+allow access to open file descriptors inherited from

+.IR gawk 's

+parent process (usually the shell).

+Other special filenames provide access to information about the running

+.I gawk

+process.

+The filenames are:

+.TP \w'\fB/dev/stdout\fR'u+1n

+.B /dev/pid

+Reading this file returns the process ID of the current process,

+in decimal, terminated with a newline.

+.TP

+.B /dev/ppid

+Reading this file returns the parent process ID of the current process,

+in decimal, terminated with a newline.

+.TP

+.B /dev/pgrpid

+Reading this file returns the process group ID of the current process,

+in decimal, terminated with a newline.

+.TP

+.B /dev/user

+Reading this file returns a single record terminated with a newline.

+The fields are separated with blanks.

+.B $1

+is the value of the

+.IR getuid (2)

+system call,

+.B $2

+is the value of the

+.IR geteuid (2)

+system call,

+.B $3

+is the value of the

+.IR getgid (2)

+system call, and

+.B $4

+is the value of the

+.IR getegid (2)

+system call.

+If there are any additional fields, they are the group IDs returned by

+.IR getgroups (2).

+(Multiple groups may not be supported on all systems.)

+.TP

+.B /dev/stdin

+The standard input.

+.TP

+.B /dev/stdout

+The standard output.

+.TP

+.B /dev/stderr

+The standard error output.

+.TP

+.BI /dev/fd/\^ n

+The file associated with the open file descriptor

+.IR n .

+.PP

+These are particularly useful for error messages. For example:

+.PP

+.RS

+.ft B

+print "You blew it!" > "/dev/stderr"

+.ft R

+.RE

+.PP

+whereas you would otherwise have to use

+.PP

+.RS

+.ft B

+print "You blew it!" | "cat 1>&2"

+.ft R

+.RE

+.PP

+These file names may also be used on the command line to name data files.

+.SS Numeric Functions

+.PP

+AWK has the following pre-defined arithmetic functions:

+.PP

+.TP \w'\fBsrand(\^\fIexpr\^\fB)\fR'u+1n

+.BI atan2( y , " x" )

+returns the arctangent of

+.I y/x

+in radians.

+.TP

+.BI cos( expr )

+returns the cosine of \fIexpr\fR, which is in radians.

+.TP

+.BI exp( expr )

+the exponential function.

+.TP

+.BI int( expr )

+truncates to integer.

+.TP

+.BI log( expr )

+the natural logarithm function.

+.TP

+.B rand()

+returns a random number between 0 and 1.

+.TP

+.BI sin( expr )

+returns the sine of \fIexpr\fR, which is in radians.

+.TP

+.BI sqrt( expr )

+the square root function.

+.TP

+.BI srand( expr )

+use

+.I expr

+as a new seed for the random number generator. If no

+.I expr

+is provided, the time of day will be used.

+The return value is the previous seed for the random

+number generator.

+.SS String Functions

+.PP

+AWK has the following pre-defined string functions:

+.PP

+.TP "\w'\fBsprintf(\^\fIfmt\fB\^, \fIexpr-list\^\fB)\fR'u+1n"

+\fBgsub(\fIr\fB, \fIs\fB, \fIt\fB)\fR

+for each substring matching the regular expression

+.I r

+in the string

+.IR t ,

+substitute the string

+.IR s ,

+and return the number of substitutions.

+If

+.I t

+is not supplied, use

+.BR $0 .

+.TP

+.BI index( s , " t" )

+returns the index of the string

+.I t

+in the string

+.IR s ,

+or 0 if

+.I t

+is not present.

+.TP

+.BI length( s )

+returns the length of the string

+.IR s ,

+or the length of

+.B $0

+if

+.I s

+is not supplied.

+.TP

+.BI match( s , " r" )

+returns the position in

+.I s

+where the regular expression

+.I r

+occurs, or 0 if

+.I r

+is not present, and sets the values of

+.B RSTART

+and

+.BR RLENGTH .

+.TP

+\fBsplit(\fIs\fB, \fIa\fB, \fIr\fB)\fR

+splits the string

+.I s

+into the array

+.I a

+on the regular expression

+.IR r ,

+and returns the number of fields. If

+.I r

+is omitted,

+.B FS

+is used instead.

+.TP

+.BI sprintf( fmt , " expr-list" )

+prints

+.I expr-list

+according to

+.IR fmt ,

+and returns the resulting string.

+.TP

+\fBsub(\fIr\fB, \fIs\fB, \fIt\fB)\fR

+just like

+.BR gsub() ,

+but only the first matching substring is replaced.

+.TP

+\fBsubstr(\fIs\fB, \fIi\fB, \fIn\fB)\fR

+returns the

+.IR n -character

+substring of

+.I s

+starting at

+.IR i .

+If

+.I n

+is omitted, the rest of

+.I s

+is used.

+.TP

+.BI tolower( str )

+returns a copy of the string

+.IR str ,

+with all the upper-case characters in

+.I str

+translated to their corresponding lower-case counterparts.

+Non-alphabetic characters are left unchanged.

+.TP

+.BI toupper( str )

+returns a copy of the string

+.IR str ,

+with all the lower-case characters in

+.I str

+translated to their corresponding upper-case counterparts.

+Non-alphabetic characters are left unchanged.

+.SS Time Functions

+.PP

+Since one of the primary uses of AWK programs is processing log files

+that contain time stamp information,

+.I gawk

+provides the following two functions for obtaining time stamps and

+formatting them.

+.PP

+.TP "\w'\fBsystime()\fR'u+1n"

+.B systime()

+returns the current time of day as the number of seconds since the Epoch

+(Midnight UTC, January 1, 1970 on \*(PX systems).

+.TP

+\fBstrftime(\fIformat\fB, \fItimestamp\fB)\fR

+formats

+.I timestamp

+according to the specification in

+.IR format .

+The

+.I timestamp

+should be of the same form as returned by

+.BR systime() .

+If

+.I timestamp

+is missing, the current time of day is used.

+See the specification for the

+.B strftime()

+function in \*(AN C for the format conversions that are

+guaranteed to be available.

+A public-domain version of

+.IR strftime (3)

+and a man page for it are shipped with

+.IR gawk ;

+if that version was used to build

+.IR gawk ,

+then all of the conversions described in that man page are available to

+.IR gawk .

+.SS String Constants

+.PP

+String constants in AWK are sequences of characters enclosed

+between double quotes (\fB"\fR). Within strings, certain

+.I "escape sequences"

+are recognized, as in C. These are:

+.PP

+.TP \w'\fB\e\^\fIddd\fR'u+1n

+.B \e\e

+A literal backslash.

+.TP

+.B \ea

+The ``alert'' character; usually the \s-1ASCII\s+1 \s-1BEL\s+1 character.

+.TP

+.B \eb

+backspace.

+.TP

+.B \ef

+form-feed.

+.TP

+.B \en

+new line.

+.TP

+.B \er

+carriage return.

+.TP

+.B \et

+horizontal tab.

+.TP

+.B \ev

+vertical tab.

+.TP

+.BI \ex "\^hex digits"

+The character represented by the string of hexadecimal digits following

+the

+.BR \ex .

+As in \*(AN C, all following hexadecimal digits are considered part of

+the escape sequence.

+(This feature should tell us something about language design by committee.)

+E.g., "\ex1B" is the \s-1ASCII\s+1 \s-1ESC\s+1 (escape) character.

+.TP

+.BI \e ddd

+The character represented by the 1-, 2-, or 3-digit sequence of octal

+digits. E.g. "\e033" is the \s-1ASCII\s+1 \s-1ESC\s+1 (escape) character.

+.TP

+.BI \e c

+The literal character

+.IR c\^ .

+.PP

+The escape sequences may also be used inside constant regular expressions

+(e.g.,

+.B "/[\ \et\ef\en\er\ev]/"

+matches whitespace characters).

+.SH FUNCTIONS

+Functions in AWK are defined as follows:

+.PP

+.RS

+\fBfunction \fIname\fB(\fIparameter list\fB) { \fIstatements \fB}\fR

+.RE

+.PP

+Functions are executed when called from within the action parts of regular

+pattern-action statements. Actual parameters supplied in the function

+call are used to instantiate the formal parameters declared in the function.

+Arrays are passed by reference, other variables are passed by value.

+.PP

+Since functions were not originally part of the AWK language, the provision

+for local variables is rather clumsy: They are declared as extra parameters

+in the parameter list. The convention is to separate local variables from

+real parameters by extra spaces in the parameter list. For example:

+.PP

+.RS

+.ft B

+.nf

+function f(p, q, a, b) { # a & b are local

+ ..... }

+/abc/ { ... ; f(1, 2) ; ... }

+.fi

+.ft R

+.RE

+.PP

+The left parenthesis in a function call is required

+to immediately follow the function name,

+without any intervening white space.

+This is to avoid a syntactic ambiguity with the concatenation operator.

+This restriction does not apply to the built-in functions listed above.

+.PP

+Functions may call each other and may be recursive.

+Function parameters used as local variables are initialized

+to the null string and the number zero upon function invocation.

+.PP

+The word

+.B func

+may be used in place of

+.BR function .

+.SH EXAMPLES

+.nf

+Print and sort the login names of all users:

+.ft B

+ BEGIN { FS = ":" }

+ { print $1 | "sort" }

+.ft R

+Count lines in a file:

+.ft B

+ { nlines++ }

+ END { print nlines }

+.ft R

+Precede each line by its number in the file:

+.ft B

+ { print FNR, $0 }

+.ft R

+Concatenate and line number (a variation on a theme):

+.ft B

+ { print NR, $0 }

+.ft R

+.fi

+.SH SEE ALSO

+.IR egrep (1)

+.PP

+.IR "The AWK Programming Language" ,

+Alfred V. Aho, Brian W. Kernighan, Peter J. Weinberger,

+Addison-Wesley, 1988. ISBN 0-201-07981-X.

+.PP

+.IR "The GAWK Manual" ,

+Edition 0.15, published by the Free Software Foundation, 1993.

+.SH POSIX COMPATIBILITY

+A primary goal for

+.I gawk

+is compatibility with the \*(PX standard, as well as with the

+latest version of \*(UX

+.IR awk .

+To this end,

+.I gawk

+incorporates the following user visible

+features which are not described in the AWK book,

+but are part of

+.I awk

+in System V Release 4, and are in the \*(PX standard.

+.PP

+The

+.B \-v

+option for assigning variables before program execution starts is new.

+The book indicates that command line variable assignment happens when

+.I awk

+would otherwise open the argument as a file, which is after the

+.B BEGIN

+block is executed. However, in earlier implementations, when such an

+assignment appeared before any file names, the assignment would happen

+.I before

+the

+.B BEGIN

+block was run. Applications came to depend on this ``feature.''

+When

+.I awk

+was changed to match its documentation, this option was added to

+accommodate applications that depended upon the old behavior.

+(This feature was agreed upon by both the AT&T and GNU developers.)

+.PP

+The

+.B \-W

+option for implementation specific features is from the \*(PX standard.

+.PP

+When processing arguments,

+.I gawk

+uses the special option ``\fB\-\^\-\fP'' to signal the end of

+arguments, and warns about, but otherwise ignores, undefined options.

+.PP

+The AWK book does not define the return value of

+.BR srand() .

+The System V Release 4 version of \*(UX

+.I awk

+(and the \*(PX standard)

+has it return the seed it was using, to allow keeping track

+of random number sequences. Therefore

+.B srand()

+in

+.I gawk

+also returns its current seed.

+.PP

+Other new features are:

+The use of multiple

+.B \-f

+options (from MKS

+.IR awk );

+the

+.B ENVIRON

+array; the

+.BR \ea ,

+and

+.BR \ev

+escape sequences (done originally in

+.I gawk

+and fed back into AT&T's); the

+.B tolower()

+and

+.B toupper()

+built-in functions (from AT&T); and the \*(AN C conversion specifications in

+.B printf

+(done first in AT&T's version).

+.SH GNU EXTENSIONS

+.I Gawk

+has some extensions to \*(PX

+.IR awk .

+They are described in this section. All the extensions described here

+can be disabled by

+invoking

+.I gawk

+with the

+.B "\-W compat"

+option.

+.PP

+The following features of

+.I gawk

+are not available in

+\*(PX

+.IR awk .

+.RS

+.TP \w'\(bu'u+1n

+\(bu

+The

+.B \ex

+escape sequence.

+.TP

+\(bu

+The

+.B systime()

+and

+.B strftime()

+functions.

+.TP

+\(bu

+The special file names available for I/O redirection are not recognized.

+.TP

+\(bu

+The

+.B ARGIND

+and

+.B ERRNO

+variables are not special.

+.TP

+\(bu

+The

+.B IGNORECASE

+variable and its side-effects are not available.

+.TP

+\(bu

+The

+.B FIELDWIDTHS

+variable and fixed width field splitting.

+.TP

+\(bu

+No path search is performed for files named via the

+.B \-f

+option. Therefore the

+.B AWKPATH

+environment variable is not special.

+.TP

+\(bu

+The use of

+.B "next file"

+to abandon processing of the current input file.

+.RE

+.PP

+The AWK book does not define the return value of the

+.B close()

+function.

+.IR Gawk\^ 's

+.B close()

+returns the value from

+.IR fclose (3),

+or

+.IR pclose (3),

+when closing a file or pipe, respectively.

+.PP

+When

+.I gawk

+is invoked with the

+.B "\-W compat"

+option,

+if the

+.I fs

+argument to the

+.B \-F

+option is ``t'', then

+.B FS

+will be set to the tab character.

+Since this is a rather ugly special case, it is not the default behavior.

+This behavior also does not occur if

+.B \-Wposix

+has been specified.

+.ig

+.PP

+If

+.I gawk

+was compiled for debugging, it will

+accept the following additional options:

+.TP

+.PD 0

+.B \-Wparsedebug

+.TP

+.PD

+.B \-\^\-parsedebug

+Turn on

+.IR yacc (1)

+or

+.IR bison (1)

+debugging output during program parsing.

+This option should only be of interest to the

+.I gawk

+maintainers, and may not even be compiled into

+.IR gawk .

+..

+.SH HISTORICAL FEATURES

+There are two features of historical AWK implementations that

+.I gawk

+supports.

+First, it is possible to call the

+.B length()

+built-in function not only with no argument, but even without parentheses!

+Thus,

+.RS

+.PP

+.ft B

+a = length

+.ft R

+.RE

+.PP

+is the same as either of

+.RS

+.PP

+.ft B

+a = length()

+.br

+a = length($0)

+.ft R

+.RE

+.PP

+This feature is marked as ``deprecated'' in the \*(PX standard, and

+.I gawk

+will issue a warning about its use if

+.B \-Wlint

+is specified on the command line.

+.PP

+The other feature is the use of the

+.B continue

+statement outside the body of a

+.BR while ,

+.BR for ,

+or

+.B do

+loop. Traditional AWK implementations have treated such usage as

+equivalent to the

+.B next

+statement.

+.I Gawk

+will support this usage if

+.B \-Wposix

+has not been specified.

+.SH BUGS

+The

+.B \-F

+option is not necessary given the command line variable assignment feature;

+it remains only for backwards compatibility.

+.PP

+If your system actually has support for

+.B /dev/fd

+and the associated

+.BR /dev/stdin ,

+.BR /dev/stdout ,

+and

+.B /dev/stderr

+files, you may get different output from

+.I gawk

+than you would get on a system without those files. When

+.I gawk

+interprets these files internally, it synchronizes output to the standard

+output with output to

+.BR /dev/stdout ,

+while on a system with those files, the output is actually to different

+open files.

+Caveat Emptor.

+.SH VERSION INFORMATION

+This man page documents

+.IR gawk ,

+version 2.15.

+.PP

+Starting with the 2.15 version of

+.IR gawk ,

+the

+.BR \-c ,

+.BR \-V ,

+.BR \-C ,

+.ig

+.BR \-D ,

+..

+.BR \-a ,

+and

+.B \-e

+options of the 2.11 version are no longer recognized.

+.SH AUTHORS

+The original version of \*(UX

+.I awk

+was designed and implemented by Alfred Aho,

+Peter Weinberger, and Brian Kernighan of AT&T Bell Labs. Brian Kernighan

+continues to maintain and enhance it.

+.PP

+Paul Rubin and Jay Fenlason,

+of the Free Software Foundation, wrote

+.IR gawk ,

+to be compatible with the original version of

+.I awk

+distributed in Seventh Edition \*(UX.

+John Woods contributed a number of bug fixes.

+David Trueman, with contributions

+from Arnold Robbins, made

+.I gawk

+compatible with the new version of \*(UX

+.IR awk .

+.PP

+The initial DOS port was done by Conrad Kwok and Scott Garfinkle.

+Scott Deifik is the current DOS maintainer. Pat Rankin did the

+port to VMS, and Michal Jaegermann did the port to the Atari ST.

+.SH ACKNOWLEDGEMENTS

+Brian Kernighan of Bell Labs

+provided valuable assistance during testing and debugging.

+We thank him.

diff --git a/gnu/usr.bin/awk/awk.h b/gnu/usr.bin/awk/awk.h
new file mode 100644
index 000000000000..ca3997f11d4b
--- /dev/null
+++ b/gnu/usr.bin/awk/awk.h

@@ -0,0 +1,763 @@

+/*

+ * awk.h -- Definitions for gawk.

+ */

+/*

+ *

+ * This file is part of GAWK, the GNU implementation of the

+ * AWK Programming Language.

+ *

+ * GAWK is free software; you can redistribute it and/or modify

+ * it under the terms of the GNU General Public License as published by

+ * the Free Software Foundation; either version 2 of the License, or

+ * (at your option) any later version.

+ *

+ * GAWK is distributed in the hope that it will be useful,

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

+ * GNU General Public License for more details.

+ *

+ * You should have received a copy of the GNU General Public License

+ * along with GAWK; see the file COPYING. If not, write to

+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

+ */

+/* ------------------------------ Includes ------------------------------ */

+#include <stdio.h>

+#include <limits.h>

+#include <ctype.h>

+#include <setjmp.h>

+#include <varargs.h>

+#include <time.h>

+#include <errno.h>

+#if !defined(errno) && !defined(MSDOS)

+extern int errno;

+#endif

+#ifdef __GNU_LIBRARY__

+#ifndef linux

+#include <signum.h>

+#endif

+/* ----------------- System dependencies (with more includes) -----------*/

+#if !defined(VMS) || (!defined(VAXC) && !defined(__DECC))

+#include <sys/types.h>

+#include <sys/stat.h>

+#else /* VMS w/ VAXC or DECC */

+#include <types.h>

+#include <stat.h>

+#include <file.h> /* avoid <fcntl.h> in io.c */

+#endif

+#include <signal.h>

+#include "config.h"

+#ifdef __STDC__

+#define P(s) s

+#define MALLOC_ARG_T size_t

+#else

+#define P(s) ()

+#define MALLOC_ARG_T unsigned

+#define volatile

+#define const

+#endif

+#ifndef SIGTYPE

+#define SIGTYPE void

+#endif

+#ifdef SIZE_T_MISSING

+typedef unsigned int size_t;

+#endif

+#ifndef SZTC

+#define SZTC

+#define INTC

+#endif

+#ifdef STDC_HEADERS

+#include <stdlib.h>

+#include <string.h>

+#ifdef NeXT

+#include <libc.h>

+#undef atof

+#else

+#if defined(atarist) || defined(VMS)

+#include <unixlib.h>

+#else /* atarist || VMS */

+#ifndef MSDOS

+#include <unistd.h>

+#endif /* MSDOS */

+#endif /* atarist || VMS */

+#endif /* Next */

+#else /* STDC_HEADERS */

+#include "protos.h"

+#endif /* STDC_HEADERS */

+#if defined(ultrix) && !defined(Ultrix41)

+extern char * getenv P((char *name));

+extern double atof P((char *s));

+#endif

+#ifndef __GNUC__

+#ifdef sparc

+/* nasty nasty SunOS-ism */

+#include <alloca.h>

+#ifdef lint

+extern char *alloca();

+#endif

+#else /* not sparc */

+#if !defined(alloca) && !defined(ALLOCA_PROTO)

+extern char *alloca();

+#endif

+#endif /* sparc */

+#endif /* __GNUC__ */

+#ifdef HAVE_UNDERSCORE_SETJMP

+/* nasty nasty berkelism */

+#define setjmp _setjmp

+#define longjmp _longjmp

+#endif

+/*

+ * if you don't have vprintf, try this and cross your fingers.

+ */

+#if defined(VPRINTF_MISSING)

+#define vfprintf(fp,fmt,arg) _doprnt((fmt), (arg), (fp))

+#endif

+#ifdef VMS

+/* some macros to redirect to code in vms/vms_misc.c */

+#define exit vms_exit

+#define open vms_open

+#define strerror vms_strerror

+#define strdup vms_strdup

+extern void exit P((int));

+extern int open P((const char *,int,...));

+extern char *strerror P((int));

+extern char *strdup P((const char *str));

+extern int vms_devopen P((const char *,int));

+# ifndef NO_TTY_FWRITE

+#define fwrite tty_fwrite

+#define fclose tty_fclose

+extern size_t fwrite P((const void *,size_t,size_t,FILE *));

+extern int fclose P((FILE *));

+# endif

+extern FILE *popen P((const char *,const char *));

+extern int pclose P((FILE *));

+extern void vms_arg_fixup P((int *,char ***));

+/* some things not in STDC_HEADERS */

+extern int gnu_strftime P((char *,size_t,const char *,const struct tm *));

+extern int unlink P((const char *));

+extern int getopt P((int,char **,char *));

+extern int isatty P((int));

+#ifndef fileno

+extern int fileno P((FILE *));

+#endif

+extern int close(), dup(), dup2(), fstat(), read(), stat();

+#endif /*VMS*/

+#ifdef MSDOS

+#include <io.h>

+extern FILE *popen P((char *, char *));

+extern int pclose P((FILE *));

+#endif

+#define GNU_REGEX

+#ifdef GNU_REGEX

+#include "regex.h"

+#include "dfa.h"

+typedef struct Regexp {

+ struct re_pattern_buffer pat;

+ struct re_registers regs;

+ struct regexp dfareg;

+ int dfa;

+} Regexp;

+#define RESTART(rp,s) (rp)->regs.start[0]

+#define REEND(rp,s) (rp)->regs.end[0]

+#else /* GNU_REGEX */

+#endif /* GNU_REGEX */

+#ifdef atarist

+#define read _text_read /* we do not want all these CR's to mess our input */

+extern int _text_read (int, char *, int);

+#endif

+#ifndef DEFPATH

+#define DEFPATH ".:/usr/local/lib/awk:/usr/lib/awk"

+#endif

+#ifndef ENVSEP

+#define ENVSEP ':'

+#endif

+/* ------------------ Constants, Structures, Typedefs ------------------ */

+#define AWKNUM double

+typedef enum {

+ /* illegal entry == 0 */

+ Node_illegal,

+ /* binary operators lnode and rnode are the expressions to work on */

+ Node_times,

+ Node_quotient,

+ Node_mod,

+ Node_plus,

+ Node_minus,

+ Node_cond_pair, /* conditional pair (see Node_line_range) */

+ Node_subscript,

+ Node_concat,

+ Node_exp,

+ /* unary operators subnode is the expression to work on */

+/*10*/ Node_preincrement,

+ Node_predecrement,

+ Node_postincrement,

+ Node_postdecrement,

+ Node_unary_minus,

+ Node_field_spec,

+ /* assignments lnode is the var to assign to, rnode is the exp */

+ Node_assign,

+ Node_assign_times,

+ Node_assign_quotient,

+ Node_assign_mod,

+/*20*/ Node_assign_plus,

+ Node_assign_minus,

+ Node_assign_exp,

+ /* boolean binaries lnode and rnode are expressions */

+ Node_and,

+ Node_or,

+ /* binary relationals compares lnode and rnode */

+ Node_equal,

+ Node_notequal,

+ Node_less,

+ Node_greater,

+ Node_leq,

+/*30*/ Node_geq,

+ Node_match,

+ Node_nomatch,

+ /* unary relationals works on subnode */

+ Node_not,

+ /* program structures */

+ Node_rule_list, /* lnode is a rule, rnode is rest of list */

+ Node_rule_node, /* lnode is pattern, rnode is statement */

+ Node_statement_list, /* lnode is statement, rnode is more list */

+ Node_if_branches, /* lnode is to run on true, rnode on false */

+ Node_expression_list, /* lnode is an exp, rnode is more list */

+ Node_param_list, /* lnode is a variable, rnode is more list */

+ /* keywords */

+/*40*/ Node_K_if, /* lnode is conditional, rnode is if_branches */

+ Node_K_while, /* lnode is conditional, rnode is stuff to run */

+ Node_K_for, /* lnode is for_struct, rnode is stuff to run */

+ Node_K_arrayfor, /* lnode is for_struct, rnode is stuff to run */

+ Node_K_break, /* no subs */

+ Node_K_continue, /* no stuff */

+ Node_K_print, /* lnode is exp_list, rnode is redirect */

+ Node_K_printf, /* lnode is exp_list, rnode is redirect */

+ Node_K_next, /* no subs */

+ Node_K_exit, /* subnode is return value, or NULL */

+/*50*/ Node_K_do, /* lnode is conditional, rnode stuff to run */

+ Node_K_return,

+ Node_K_delete,

+ Node_K_getline,

+ Node_K_function, /* lnode is statement list, rnode is params */

+ /* I/O redirection for print statements */

+ Node_redirect_output, /* subnode is where to redirect */

+ Node_redirect_append, /* subnode is where to redirect */

+ Node_redirect_pipe, /* subnode is where to redirect */

+ Node_redirect_pipein, /* subnode is where to redirect */

+ Node_redirect_input, /* subnode is where to redirect */

+ /* Variables */

+/*60*/ Node_var, /* rnode is value, lnode is array stuff */

+ Node_var_array, /* array is ptr to elements, asize num of

+ * eles */

+ Node_val, /* node is a value - type in flags */

+ /* Builtins subnode is explist to work on, proc is func to call */

+ Node_builtin,

+ /*

+ * pattern: conditional ',' conditional ; lnode of Node_line_range

+ * is the two conditionals (Node_cond_pair), other word (rnode place)

+ * is a flag indicating whether or not this range has been entered.

+ */

+ Node_line_range,

+ /*

+ * boolean test of membership in array lnode is string-valued

+ * expression rnode is array name

+ */

+ Node_in_array,

+ Node_func, /* lnode is param. list, rnode is body */

+ Node_func_call, /* lnode is name, rnode is argument list */

+ Node_cond_exp, /* lnode is conditional, rnode is if_branches */

+ Node_regex,

+/*70*/ Node_hashnode,

+ Node_ahash,

+ Node_NF,

+ Node_NR,

+ Node_FNR,

+ Node_FS,

+ Node_RS,

+ Node_FIELDWIDTHS,

+ Node_IGNORECASE,

+ Node_OFS,

+ Node_ORS,

+ Node_OFMT,

+ Node_CONVFMT,

+ Node_K_nextfile

+} NODETYPE;

+/*

+ * NOTE - this struct is rather kludgey -- it is packed to minimize

+ * space usage, at the expense of cleanliness. Alter at own risk.

+ */

+typedef struct exp_node {

+ union {

+ struct {

+ union {

+ struct exp_node *lptr;

+ char *param_name;

+ } l;

+ union {

+ struct exp_node *rptr;

+ struct exp_node *(*pptr) ();

+ Regexp *preg;

+ struct for_loop_header *hd;

+ struct exp_node **av;

+ int r_ent; /* range entered */

+ } r;

+ union {

+ char *name;

+ struct exp_node *extra;

+ } x;

+ short number;

+ unsigned char reflags;

+# define CASE 1

+# define CONST 2

+# define FS_DFLT 4

+ } nodep;

+ struct {

+ AWKNUM fltnum; /* this is here for optimal packing of

+ * the structure on many machines

+ */

+ char *sp;

+ size_t slen;

+ unsigned char sref;

+ char idx;

+ } val;

+ struct {

+ struct exp_node *next;

+ char *name;

+ int length;

+ struct exp_node *value;

+ } hash;

+#define hnext sub.hash.next

+#define hname sub.hash.name

+#define hlength sub.hash.length

+#define hvalue sub.hash.value

+ struct {

+ struct exp_node *next;

+ struct exp_node *name;

+ struct exp_node *value;

+ } ahash;

+#define ahnext sub.ahash.next

+#define ahname sub.ahash.name

+#define ahvalue sub.ahash.value

+ } sub;

+ NODETYPE type;

+ unsigned short flags;

+# define MALLOC 1 /* can be free'd */

+# define TEMP 2 /* should be free'd */

+# define PERM 4 /* can't be free'd */

+# define STRING 8 /* assigned as string */

+# define STR 16 /* string value is current */

+# define NUM 32 /* numeric value is current */

+# define NUMBER 64 /* assigned as number */

+# define MAYBE_NUM 128 /* user input: if NUMERIC then

+ * a NUMBER

+ */

+ char *vname; /* variable's name */

+} NODE;

+#define lnode sub.nodep.l.lptr

+#define nextp sub.nodep.l.lptr

+#define rnode sub.nodep.r.rptr

+#define source_file sub.nodep.x.name

+#define source_line sub.nodep.number

+#define param_cnt sub.nodep.number

+#define param sub.nodep.l.param_name

+#define subnode lnode

+#define proc sub.nodep.r.pptr

+#define re_reg sub.nodep.r.preg

+#define re_flags sub.nodep.reflags

+#define re_text lnode

+#define re_exp sub.nodep.x.extra

+#define re_cnt sub.nodep.number

+#define forsub lnode

+#define forloop rnode->sub.nodep.r.hd

+#define stptr sub.val.sp

+#define stlen sub.val.slen

+#define stref sub.val.sref

+#define stfmt sub.val.idx

+#define numbr sub.val.fltnum

+#define var_value lnode

+#define var_array sub.nodep.r.av

+#define condpair lnode

+#define triggered sub.nodep.r.r_ent

+#ifdef DONTDEF

+int primes[] = {31, 61, 127, 257, 509, 1021, 2053, 4099, 8191, 16381};

+#endif

+/* a quick profile suggests that the following is a good value */

+#define HASHSIZE 127

+typedef struct for_loop_header {

+ NODE *init;

+ NODE *cond;

+ NODE *incr;

+} FOR_LOOP_HEADER;

+/* for "for(iggy in foo) {" */

+struct search {

+ NODE **arr_ptr;

+ NODE **arr_end;

+ NODE *bucket;

+ NODE *retval;

+};

+/* for faster input, bypass stdio */

+typedef struct iobuf {

+ int fd;

+ char *buf;

+ char *off;

+ char *end;

+ size_t size; /* this will be determined by an fstat() call */

+ int cnt;

+ long secsiz;

+ int flag;

+# define IOP_IS_TTY 1

+# define IOP_IS_INTERNAL 2

+# define IOP_NO_FREE 4

+} IOBUF;

+typedef void (*Func_ptr)();

+/*

+ * structure used to dynamically maintain a linked-list of open files/pipes

+ */

+struct redirect {

+ unsigned int flag;

+# define RED_FILE 1

+# define RED_PIPE 2

+# define RED_READ 4

+# define RED_WRITE 8

+# define RED_APPEND 16

+# define RED_NOBUF 32

+# define RED_USED 64

+# define RED_EOF 128

+ char *value;

+ FILE *fp;

+ IOBUF *iop;

+ int pid;

+ int status;

+ struct redirect *prev;

+ struct redirect *next;

+};

+/* structure for our source, either a command line string or a source file */

+struct src {

+ enum srctype { CMDLINE = 1, SOURCEFILE } stype;

+ char *val;

+};

+/* longjmp return codes, must be nonzero */

+/* Continue means either for loop/while continue, or next input record */

+#define TAG_CONTINUE 1

+/* Break means either for/while break, or stop reading input */

+#define TAG_BREAK 2

+/* Return means return from a function call; leave value in ret_node */

+#define TAG_RETURN 3

+#define HUGE INT_MAX

+/* -------------------------- External variables -------------------------- */

+/* gawk builtin variables */

+extern int NF;

+extern int NR;

+extern int FNR;

+extern int IGNORECASE;

+extern char *RS;

+extern char *OFS;

+extern int OFSlen;

+extern char *ORS;

+extern int ORSlen;

+extern char *OFMT;

+extern char *CONVFMT;

+extern int CONVFMTidx;

+extern int OFMTidx;

+extern NODE *FS_node, *NF_node, *RS_node, *NR_node;

+extern NODE *FILENAME_node, *OFS_node, *ORS_node, *OFMT_node;

+extern NODE *CONVFMT_node;

+extern NODE *FNR_node, *RLENGTH_node, *RSTART_node, *SUBSEP_node;

+extern NODE *IGNORECASE_node;

+extern NODE *FIELDWIDTHS_node;

+extern NODE **stack_ptr;

+extern NODE *Nnull_string;

+extern NODE **fields_arr;

+extern int sourceline;

+extern char *source;

+extern NODE *expression_value;

+extern NODE *_t; /* used as temporary in tree_eval */

+extern const char *myname;

+extern NODE *nextfree;

+extern int field0_valid;

+extern int do_unix;

+extern int do_posix;

+extern int do_lint;

+extern int in_begin_rule;

+extern int in_end_rule;

+/* ------------------------- Pseudo-functions ------------------------- */

+#define is_identchar(c) (isalnum(c) || (c) == '_')

+#ifndef MPROF

+#define getnode(n) if (nextfree) n = nextfree, nextfree = nextfree->nextp;\

+ else n = more_nodes()

+#define freenode(n) ((n)->nextp = nextfree, nextfree = (n))

+#else

+#define getnode(n) emalloc(n, NODE *, sizeof(NODE), "getnode")

+#define freenode(n) free(n)

+#endif

+#ifdef DEBUG

+#define tree_eval(t) r_tree_eval(t)

+#else

+#define tree_eval(t) (_t = (t),(_t) == NULL ? Nnull_string : \

+ ((_t)->type == Node_val ? (_t) : \

+ ((_t)->type == Node_var ? (_t)->var_value : \

+ ((_t)->type == Node_param_list ? \

+ (stack_ptr[(_t)->param_cnt])->var_value : \

+ r_tree_eval((_t))))))

+#endif

+#define make_number(x) mk_number((x), (MALLOC|NUM|NUMBER))

+#define tmp_number(x) mk_number((x), (MALLOC|TEMP|NUM|NUMBER))

+#define free_temp(n) do {if ((n)->flags&TEMP) { unref(n); }} while (0)

+#define make_string(s,l) make_str_node((s), SZTC (l),0)

+#define SCAN 1

+#define ALREADY_MALLOCED 2

+#define cant_happen() fatal("internal error line %d, file: %s", \

+ __LINE__, __FILE__);

+#if defined(__STDC__) && !defined(NO_TOKEN_PASTING)

+#define emalloc(var,ty,x,str) (void)((var=(ty)malloc((MALLOC_ARG_T)(x))) ||\

+ (fatal("%s: %s: can't allocate memory (%s)",\

+ (str), #var, strerror(errno)),0))

+#define erealloc(var,ty,x,str) (void)((var=(ty)realloc((char *)var,\

+ (MALLOC_ARG_T)(x))) ||\

+ (fatal("%s: %s: can't allocate memory (%s)",\

+ (str), #var, strerror(errno)),0))

+#else /* __STDC__ */

+#define emalloc(var,ty,x,str) (void)((var=(ty)malloc((MALLOC_ARG_T)(x))) ||\

+ (fatal("%s: %s: can't allocate memory (%s)",\

+ (str), "var", strerror(errno)),0))

+#define erealloc(var,ty,x,str) (void)((var=(ty)realloc((char *)var,\

+ (MALLOC_ARG_T)(x))) ||\

+ (fatal("%s: %s: can't allocate memory (%s)",\

+ (str), "var", strerror(errno)),0))

+#endif /* __STDC__ */

+#ifdef DEBUG

+#define force_number r_force_number

+#define force_string r_force_string

+#else /* not DEBUG */

+#ifdef lint

+extern AWKNUM force_number();

+#endif

+#ifdef MSDOS

+extern double _msc51bug;

+#define force_number(n) (_msc51bug=(_t = (n),(_t->flags & NUM) ? _t->numbr : r_force_number(_t)))

+#else /* not MSDOS */

+#define force_number(n) (_t = (n),(_t->flags & NUM) ? _t->numbr : r_force_number(_t))

+#endif /* MSDOS */

+#define force_string(s) (_t = (s),(_t->flags & STR) ? _t : r_force_string(_t))

+#endif /* not DEBUG */

+#define STREQ(a,b) (*(a) == *(b) && strcmp((a), (b)) == 0)

+#define STREQN(a,b,n) ((n)&& *(a)== *(b) && strncmp((a), (b), SZTC (n)) == 0)

+/* ------------- Function prototypes or defs (as appropriate) ------------- */

+/* array.c */

+extern NODE *concat_exp P((NODE *tree));

+extern void assoc_clear P((NODE *symbol));

+extern unsigned int hash P((char *s, int len));

+extern int in_array P((NODE *symbol, NODE *subs));

+extern NODE **assoc_lookup P((NODE *symbol, NODE *subs));

+extern void do_delete P((NODE *symbol, NODE *tree));

+extern void assoc_scan P((NODE *symbol, struct search *lookat));

+extern void assoc_next P((struct search *lookat));

+/* awk.tab.c */

+extern char *tokexpand P((void));

+extern char nextc P((void));

+extern NODE *node P((NODE *left, NODETYPE op, NODE *right));

+extern NODE *install P((char *name, NODE *value));

+extern NODE *lookup P((char *name));

+extern NODE *variable P((char *name, int can_free));

+extern int yyparse P((void));

+/* builtin.c */

+extern NODE *do_exp P((NODE *tree));

+extern NODE *do_index P((NODE *tree));

+extern NODE *do_int P((NODE *tree));

+extern NODE *do_length P((NODE *tree));

+extern NODE *do_log P((NODE *tree));

+extern NODE *do_sprintf P((NODE *tree));

+extern void do_printf P((NODE *tree));

+extern void print_simple P((NODE *tree, FILE *fp));

+extern NODE *do_sqrt P((NODE *tree));

+extern NODE *do_substr P((NODE *tree));

+extern NODE *do_strftime P((NODE *tree));

+extern NODE *do_systime P((NODE *tree));

+extern NODE *do_system P((NODE *tree));

+extern void do_print P((NODE *tree));

+extern NODE *do_tolower P((NODE *tree));

+extern NODE *do_toupper P((NODE *tree));

+extern NODE *do_atan2 P((NODE *tree));

+extern NODE *do_sin P((NODE *tree));

+extern NODE *do_cos P((NODE *tree));

+extern NODE *do_rand P((NODE *tree));

+extern NODE *do_srand P((NODE *tree));

+extern NODE *do_match P((NODE *tree));

+extern NODE *do_gsub P((NODE *tree));

+extern NODE *do_sub P((NODE *tree));

+/* eval.c */

+extern int interpret P((NODE *volatile tree));

+extern NODE *r_tree_eval P((NODE *tree));

+extern int cmp_nodes P((NODE *t1, NODE *t2));

+extern NODE **get_lhs P((NODE *ptr, Func_ptr *assign));

+extern void set_IGNORECASE P((void));

+void set_OFS P((void));

+void set_ORS P((void));

+void set_OFMT P((void));

+void set_CONVFMT P((void));

+/* field.c */

+extern void init_fields P((void));

+extern void set_record P((char *buf, int cnt, int freeold));

+extern void reset_record P((void));

+extern void set_NF P((void));

+extern NODE **get_field P((int num, Func_ptr *assign));

+extern NODE *do_split P((NODE *tree));

+extern void set_FS P((void));

+extern void set_RS P((void));

+extern void set_FIELDWIDTHS P((void));

+/* io.c */

+extern void set_FNR P((void));

+extern void set_NR P((void));

+extern void do_input P((void));

+extern struct redirect *redirect P((NODE *tree, int *errflg));

+extern NODE *do_close P((NODE *tree));

+extern int flush_io P((void));

+extern int close_io P((void));

+extern int devopen P((char *name, char *mode));

+extern int pathopen P((char *file));

+extern NODE *do_getline P((NODE *tree));

+extern void do_nextfile P((void));

+/* iop.c */

+extern int optimal_bufsize P((int fd));

+extern IOBUF *iop_alloc P((int fd));

+extern int get_a_record P((char **out, IOBUF *iop, int rs, int *errcode));

+/* main.c */

+extern int main P((int argc, char **argv));

+extern Regexp *mk_re_parse P((char *s, int ignorecase));

+extern void load_environ P((void));

+extern char *arg_assign P((char *arg));

+extern SIGTYPE catchsig P((int sig, int code));

+/* msg.c */

+#ifdef MSDOS

+extern void err P((char *s, char *emsg, char *va_list, ...));

+extern void msg P((char *va_alist, ...));

+extern void warning P((char *va_alist, ...));

+extern void fatal P((char *va_alist, ...));

+#else

+extern void err ();

+extern void msg ();

+extern void warning ();

+extern void fatal ();

+#endif

+/* node.c */

+extern AWKNUM r_force_number P((NODE *n));

+extern NODE *r_force_string P((NODE *s));

+extern NODE *dupnode P((NODE *n));

+extern NODE *mk_number P((AWKNUM x, unsigned int flags));

+extern NODE *make_str_node P((char *s, size_t len, int scan ));

+extern NODE *tmp_string P((char *s, size_t len ));

+extern NODE *more_nodes P((void));

+#ifdef DEBUG

+extern void freenode P((NODE *it));

+#endif

+extern void unref P((NODE *tmp));

+extern int parse_escape P((char **string_ptr));

+/* re.c */

+extern Regexp *make_regexp P((char *s, int len, int ignorecase, int dfa));

+extern int research P((Regexp *rp, char *str, int start, int len, int need_start));

+extern void refree P((Regexp *rp));

+extern void reg_error P((const char *s));

+extern Regexp *re_update P((NODE *t));

+extern void resyntax P((int syntax));

+extern void resetup P((void));

+/* strcase.c */

+extern int strcasecmp P((const char *s1, const char *s2));

+extern int strncasecmp P((const char *s1, const char *s2, register size_t n));

+#ifdef atarist

+/* atari/tmpnam.c */

+extern char *tmpnam P((char *buf));

+extern char *tempnam P((const char *path, const char *base));

+#endif

+/* Figure out what '\a' really is. */

+#ifdef __STDC__

+#define BELL '\a' /* sure makes life easy, don't it? */

+#else

+# if 'z' - 'a' == 25 /* ascii */

+# if 'a' != 97 /* machine is dumb enough to use mark parity */

+# define BELL '\207'

+# else

+# define BELL '\07'

+# endif

+# else

+# define BELL '\057'

+# endif

+#endif

+extern char casetable[]; /* for case-independent regexp matching */

diff --git a/gnu/usr.bin/awk/awk.y b/gnu/usr.bin/awk/awk.y
new file mode 100644
index 000000000000..6e87f1c449cc
--- /dev/null
+++ b/gnu/usr.bin/awk/awk.y

@@ -0,0 +1,1804 @@

+/*

+ * awk.y --- yacc/bison parser

+ */

+/*

+ *

+ * This file is part of GAWK, the GNU implementation of the

+ * AWK Programming Language.

+ *

+ * GAWK is free software; you can redistribute it and/or modify

+ * it under the terms of the GNU General Public License as published by

+ * the Free Software Foundation; either version 2 of the License, or

+ * (at your option) any later version.

+ *

+ * GAWK is distributed in the hope that it will be useful,

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

+ * GNU General Public License for more details.

+ *

+ * You should have received a copy of the GNU General Public License

+ * along with GAWK; see the file COPYING. If not, write to

+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

+ */

+%{

+#ifdef DEBUG

+#define YYDEBUG 12

+#endif

+#include "awk.h"

+static void yyerror (); /* va_alist */

+static char *get_src_buf P((void));

+static int yylex P((void));

+static NODE *node_common P((NODETYPE op));

+static NODE *snode P((NODE *subn, NODETYPE op, int sindex));

+static NODE *mkrangenode P((NODE *cpair));

+static NODE *make_for_loop P((NODE *init, NODE *cond, NODE *incr));

+static NODE *append_right P((NODE *list, NODE *new));

+static void func_install P((NODE *params, NODE *def));

+static void pop_var P((NODE *np, int freeit));

+static void pop_params P((NODE *params));

+static NODE *make_param P((char *name));

+static NODE *mk_rexp P((NODE *exp));

+static int want_assign; /* lexical scanning kludge */

+static int want_regexp; /* lexical scanning kludge */

+static int can_return; /* lexical scanning kludge */

+static int io_allowed = 1; /* lexical scanning kludge */

+static char *lexptr; /* pointer to next char during parsing */

+static char *lexend;

+static char *lexptr_begin; /* keep track of where we were for error msgs */

+static char *lexeme; /* beginning of lexeme for debugging */

+static char *thisline = NULL;

+#define YYDEBUG_LEXER_TEXT (lexeme)

+static int param_counter;

+static char *tokstart = NULL;

+static char *token = NULL;

+static char *tokend;

+NODE *variables[HASHSIZE];

+extern char *source;

+extern int sourceline;

+extern struct src *srcfiles;

+extern int numfiles;

+extern int errcount;

+extern NODE *begin_block;

+extern NODE *end_block;

+%}

+%union {

+ long lval;

+ AWKNUM fval;

+ NODE *nodeval;

+ NODETYPE nodetypeval;

+ char *sval;

+ NODE *(*ptrval)();

+%type <nodeval> function_prologue function_body

+%type <nodeval> rexp exp start program rule simp_exp

+%type <nodeval> non_post_simp_exp

+%type <nodeval> pattern

+%type <nodeval> action variable param_list

+%type <nodeval> rexpression_list opt_rexpression_list

+%type <nodeval> expression_list opt_expression_list

+%type <nodeval> statements statement if_statement opt_param_list

+%type <nodeval> opt_exp opt_variable regexp

+%type <nodeval> input_redir output_redir

+%type <nodetypeval> print

+%type <sval> func_name

+%type <lval> lex_builtin

+%token <sval> FUNC_CALL NAME REGEXP

+%token <lval> ERROR

+%token <nodeval> YNUMBER YSTRING

+%token <nodetypeval> RELOP APPEND_OP

+%token <nodetypeval> ASSIGNOP MATCHOP NEWLINE CONCAT_OP

+%token <nodetypeval> LEX_BEGIN LEX_END LEX_IF LEX_ELSE LEX_RETURN LEX_DELETE

+%token <nodetypeval> LEX_WHILE LEX_DO LEX_FOR LEX_BREAK LEX_CONTINUE

+%token <nodetypeval> LEX_PRINT LEX_PRINTF LEX_NEXT LEX_EXIT LEX_FUNCTION

+%token <nodetypeval> LEX_GETLINE

+%token <nodetypeval> LEX_IN

+%token <lval> LEX_AND LEX_OR INCREMENT DECREMENT

+%token <lval> LEX_BUILTIN LEX_LENGTH

+/* these are just yylval numbers */

+/* Lowest to highest */

+%right ASSIGNOP

+%right '?' ':'

+%left LEX_OR

+%left LEX_AND

+%left LEX_GETLINE

+%nonassoc LEX_IN

+%left FUNC_CALL LEX_BUILTIN LEX_LENGTH

+%nonassoc MATCHOP

+%nonassoc RELOP '<' '>' '|' APPEND_OP

+%left CONCAT_OP

+%left YSTRING YNUMBER

+%left '+' '-'

+%left '*' '/' '%'

+%right '!' UNARY

+%right '^'

+%left INCREMENT DECREMENT

+%left '$'

+%left '(' ')'

+%%

+start

+ : opt_nls program opt_nls

+ { expression_value = $2; }

+ ;

+program

+ : rule

+ {

+ if ($1 != NULL)

+ $$ = $1;

+ else

+ $$ = NULL;

+ yyerrok;

+ }

+ | program rule

+ /* add the rule to the tail of list */

+ {

+ if ($2 == NULL)

+ $$ = $1;

+ else if ($1 == NULL)

+ $$ = $2;

+ else {

+ if ($1->type != Node_rule_list)

+ $1 = node($1, Node_rule_list,

+ (NODE*)NULL);

+ $$ = append_right ($1,

+ node($2, Node_rule_list,(NODE *) NULL));

+ }

+ yyerrok;

+ }

+ | error { $$ = NULL; }

+ | program error { $$ = NULL; }

+ ;

+rule

+ : LEX_BEGIN { io_allowed = 0; }

+ action

+ {

+ if (begin_block) {

+ if (begin_block->type != Node_rule_list)

+ begin_block = node(begin_block, Node_rule_list,

+ (NODE *)NULL);

+ (void) append_right (begin_block, node(

+ node((NODE *)NULL, Node_rule_node, $3),

+ Node_rule_list, (NODE *)NULL) );

+ } else

+ begin_block = node((NODE *)NULL, Node_rule_node, $3);

+ $$ = NULL;

+ io_allowed = 1;

+ yyerrok;

+ }

+ | LEX_END { io_allowed = 0; }

+ action

+ {

+ if (end_block) {

+ if (end_block->type != Node_rule_list)

+ end_block = node(end_block, Node_rule_list,

+ (NODE *)NULL);

+ (void) append_right (end_block, node(

+ node((NODE *)NULL, Node_rule_node, $3),

+ Node_rule_list, (NODE *)NULL));

+ } else

+ end_block = node((NODE *)NULL, Node_rule_node, $3);

+ $$ = NULL;

+ io_allowed = 1;

+ yyerrok;

+ }

+ | LEX_BEGIN statement_term

+ {

+ warning("BEGIN blocks must have an action part");

+ errcount++;

+ yyerrok;

+ }

+ | LEX_END statement_term

+ {

+ warning("END blocks must have an action part");

+ errcount++;

+ yyerrok;

+ }

+ | pattern action

+ { $$ = node ($1, Node_rule_node, $2); yyerrok; }

+ | action

+ { $$ = node ((NODE *)NULL, Node_rule_node, $1); yyerrok; }

+ | pattern statement_term

+ {

+ $$ = node ($1,

+ Node_rule_node,

+ node(node(node(make_number(0.0),

+ Node_field_spec,

+ (NODE *) NULL),

+ Node_expression_list,

+ (NODE *) NULL),

+ Node_K_print,

+ (NODE *) NULL));

+ yyerrok;

+ }

+ | function_prologue function_body

+ {

+ func_install($1, $2);

+ $$ = NULL;

+ yyerrok;

+ }

+ ;

+func_name

+ : NAME

+ { $$ = $1; }

+ | FUNC_CALL

+ { $$ = $1; }

+ | lex_builtin

+ {

+ yyerror("%s() is a built-in function, it cannot be redefined",

+ tokstart);

+ errcount++;

+ /* yyerrok; */

+ }

+ ;

+lex_builtin

+ : LEX_BUILTIN

+ | LEX_LENGTH

+ ;

+function_prologue

+ : LEX_FUNCTION

+ {

+ param_counter = 0;

+ }

+ func_name '(' opt_param_list r_paren opt_nls

+ {

+ $$ = append_right(make_param($3), $5);

+ can_return = 1;

+ }

+ ;

+function_body

+ : l_brace statements r_brace opt_semi

+ {

+ $$ = $2;

+ can_return = 0;

+ }

+ ;

+pattern

+ : exp

+ { $$ = $1; }

+ | exp comma exp

+ { $$ = mkrangenode ( node($1, Node_cond_pair, $3) ); }

+ ;

+regexp

+ /*

+ * In this rule, want_regexp tells yylex that the next thing

+ * is a regexp so it should read up to the closing slash.

+ */

+ : '/'

+ { ++want_regexp; }

+ REGEXP '/'

+ {

+ NODE *n;

+ int len;

+ getnode(n);

+ n->type = Node_regex;

+ len = strlen($3);

+ n->re_exp = make_string($3, len);

+ n->re_reg = make_regexp($3, len, 0, 1);

+ n->re_text = NULL;

+ n->re_flags = CONST;

+ n->re_cnt = 1;

+ $$ = n;

+ }

+ ;

+action

+ : l_brace statements r_brace opt_semi opt_nls

+ { $$ = $2 ; }

+ | l_brace r_brace opt_semi opt_nls

+ { $$ = NULL; }

+ ;

+statements

+ : statement

+ { $$ = $1; }

+ | statements statement

+ {

+ if ($1 == NULL || $1->type != Node_statement_list)

+ $1 = node($1, Node_statement_list,(NODE *)NULL);

+ $$ = append_right($1,

+ node( $2, Node_statement_list, (NODE *)NULL));

+ yyerrok;

+ }

+ | error

+ { $$ = NULL; }

+ | statements error

+ { $$ = NULL; }

+ ;

+statement_term

+ : nls

+ | semi opt_nls

+ ;

+/*
+ * statement --- one awk statement; the semantic value is the parse tree
+ * for it (NULL for an empty statement or an empty brace pair).
+ *
+ * The `next' alternative also recognises the two-word form `next file':
+ * it is taken when the optional expression is the very node that
+ * lookup("file") returns.
+ */
+statement
+	: semi opt_nls
+		{ $$ = NULL; }
+	| l_brace r_brace
+		{ $$ = NULL; }
+	| l_brace statements r_brace
+		{ $$ = $2; }
+	| if_statement
+		{ $$ = $1; }
+	| LEX_WHILE '(' exp r_paren opt_nls statement
+		{ $$ = node ($3, Node_K_while, $6); }
+	| LEX_DO opt_nls statement LEX_WHILE '(' exp r_paren opt_nls
+		{ $$ = node ($6, Node_K_do, $3); }
+	| LEX_FOR '(' NAME LEX_IN NAME r_paren opt_nls statement
+		{
+		  /* body on the left; (loop variable, array) pair on the right */
+		  $$ = node ($8, Node_K_arrayfor, make_for_loop(variable($3,1),
+			(NODE *)NULL, variable($5,1)));
+		}
+	| LEX_FOR '(' opt_exp semi exp semi opt_exp r_paren opt_nls statement
+		{
+		  $$ = node($10, Node_K_for, (NODE *)make_for_loop($3, $5, $7));
+		}
+	| LEX_FOR '(' opt_exp semi semi opt_exp r_paren opt_nls statement
+		{
+		  /* no condition given: pass NULL for it */
+		  $$ = node ($9, Node_K_for,
+			(NODE *)make_for_loop($3, (NODE *)NULL, $6));
+		}
+	| LEX_BREAK statement_term
+	   /* for break, maybe we'll have to remember where to break to */
+		{ $$ = node ((NODE *)NULL, Node_K_break, (NODE *)NULL); }
+	| LEX_CONTINUE statement_term
+	   /* similarly */
+		{ $$ = node ((NODE *)NULL, Node_K_continue, (NODE *)NULL); }
+	| print '(' expression_list r_paren output_redir statement_term
+		{ $$ = node ($3, $1, $5); }
+	| print opt_rexpression_list output_redir statement_term
+		{
+		  /* a bare `print' prints $0 */
+		  if ($1 == Node_K_print && $2 == NULL)
+			$2 = node(node(make_number(0.0),
+				       Node_field_spec,
+				       (NODE *) NULL),
+				  Node_expression_list,
+				  (NODE *) NULL);
+
+		  $$ = node ($2, $1, $3);
+		}
+	| LEX_NEXT opt_exp statement_term
+		{ NODETYPE type;
+
+		  if ($2 && $2 == lookup("file")) {
+			/*
+			 * These three checks are independent of each other.
+			 * They used to be chained with `else if', so turning
+			 * lint on suppressed both hard errors below.
+			 */
+			if (do_lint)
+				warning("`next file' is a gawk extension");
+			if (do_unix || do_posix)
+				yyerror("`next file' is a gawk extension");
+			if (! io_allowed)
+				yyerror("`next file' used in BEGIN or END action");
+			type = Node_K_nextfile;
+		  } else {
+			if (! io_allowed)
+				yyerror("next used in BEGIN or END action");
+			type = Node_K_next;
+		  }
+		  $$ = node ((NODE *)NULL, type, (NODE *)NULL);
+		}
+	| LEX_EXIT opt_exp statement_term
+		{ $$ = node ($2, Node_K_exit, (NODE *)NULL); }
+	| LEX_RETURN
+		{ if (! can_return) yyerror("return used outside function context"); }
+	  opt_exp statement_term
+		/* the check above is a mid-rule action, so opt_exp is $3 */
+		{ $$ = node ($3, Node_K_return, (NODE *)NULL); }
+	| LEX_DELETE NAME '[' expression_list ']' statement_term
+		{ $$ = node (variable($2,1), Node_K_delete, $4); }
+	| exp statement_term
+		{ $$ = $1; }
+	;

+print

+ : LEX_PRINT

+ { $$ = $1; }

+ | LEX_PRINTF

+ { $$ = $1; }

+ ;

+if_statement

+ : LEX_IF '(' exp r_paren opt_nls statement

+ {

+ $$ = node($3, Node_K_if,

+ node($6, Node_if_branches, (NODE *)NULL));

+ }

+ | LEX_IF '(' exp r_paren opt_nls statement

+ LEX_ELSE opt_nls statement

+ { $$ = node ($3, Node_K_if,

+ node ($6, Node_if_branches, $9)); }

+ ;

+nls

+ : NEWLINE

+ { want_assign = 0; }

+ | nls NEWLINE

+ ;

+opt_nls

+ : /* empty */

+ | nls

+ ;

+input_redir

+ : /* empty */

+ { $$ = NULL; }

+ | '<' simp_exp

+ { $$ = node ($2, Node_redirect_input, (NODE *)NULL); }

+ ;

+output_redir

+ : /* empty */

+ { $$ = NULL; }

+ | '>' exp

+ { $$ = node ($2, Node_redirect_output, (NODE *)NULL); }

+ | APPEND_OP exp

+ { $$ = node ($2, Node_redirect_append, (NODE *)NULL); }

+ | '|' exp

+ { $$ = node ($2, Node_redirect_pipe, (NODE *)NULL); }

+ ;

+opt_param_list

+ : /* empty */

+ { $$ = NULL; }

+ | param_list

+ { $$ = $1; }

+ ;

+param_list

+ : NAME

+ { $$ = make_param($1); }

+ | param_list comma NAME

+ { $$ = append_right($1, make_param($3)); yyerrok; }

+ | error

+ { $$ = NULL; }

+ | param_list error

+ { $$ = NULL; }

+ | param_list comma error

+ { $$ = NULL; }

+ ;

+/* optional expression, as in for loop */

+opt_exp

+ : /* empty */

+ { $$ = NULL; }

+ | exp

+ { $$ = $1; }

+ ;

+opt_rexpression_list

+ : /* empty */

+ { $$ = NULL; }

+ | rexpression_list

+ { $$ = $1; }

+ ;

+rexpression_list

+ : rexp

+ { $$ = node ($1, Node_expression_list, (NODE *)NULL); }

+ | rexpression_list comma rexp

+ {

+ $$ = append_right($1,

+ node( $3, Node_expression_list, (NODE *)NULL));

+ yyerrok;

+ }

+ | error

+ { $$ = NULL; }

+ | rexpression_list error

+ { $$ = NULL; }

+ | rexpression_list error rexp

+ { $$ = NULL; }

+ | rexpression_list comma error

+ { $$ = NULL; }

+ ;

+opt_expression_list

+ : /* empty */

+ { $$ = NULL; }

+ | expression_list

+ { $$ = $1; }

+ ;

+expression_list

+ : exp

+ { $$ = node ($1, Node_expression_list, (NODE *)NULL); }

+ | expression_list comma exp

+ {

+ $$ = append_right($1,

+ node( $3, Node_expression_list, (NODE *)NULL));

+ yyerrok;

+ }

+ | error

+ { $$ = NULL; }

+ | expression_list error

+ { $$ = NULL; }

+ | expression_list error exp

+ { $$ = NULL; }

+ | expression_list comma error

+ { $$ = NULL; }

+ ;

+/* Expressions, not including the comma operator. */

+/*
+ * exp --- a general expression; the semantic value is its parse tree.
+ *
+ * Lint diagnostics are issued when a regexp constant appears as an
+ * operand where it is probably a mistake.  Each message names the side
+ * of the operator that is actually tested.
+ */
+exp	: variable ASSIGNOP
+		{ want_assign = 0; }
+	  exp
+		{
+		  /* $3 is the mid-rule action, so $4 is the right-hand side */
+		  if (do_lint && $4->type == Node_regex)
+			warning("Regular expression on right of assignment.");
+		  $$ = node ($1, $2, $4);
+		}
+	| '(' expression_list r_paren LEX_IN NAME
+		{ $$ = node (variable($5,1), Node_in_array, $2); }
+	| exp '|' LEX_GETLINE opt_variable
+		{
+		  $$ = node ($4, Node_K_getline,
+			 node ($1, Node_redirect_pipein, (NODE *)NULL));
+		}
+	| LEX_GETLINE opt_variable input_redir
+		{
+		  if (do_lint && ! io_allowed && $3 == NULL)
+			warning("non-redirected getline undefined inside BEGIN or END action");
+		  $$ = node ($2, Node_K_getline, $3);
+		}
+	| exp LEX_AND exp
+		{ $$ = node ($1, Node_and, $3); }
+	| exp LEX_OR exp
+		{ $$ = node ($1, Node_or, $3); }
+	| exp MATCHOP exp
+		{
+		  if ($1->type == Node_regex)
+			warning("Regular expression on left of MATCH operator.");
+		  $$ = node ($1, $2, mk_rexp($3));
+		}
+	| regexp
+		{ $$ = $1; }
+	| '!' regexp %prec UNARY
+		{
+		  /* the left operand is a field-spec node for field 0 */
+		  $$ = node(node(make_number(0.0),
+				 Node_field_spec,
+				 (NODE *) NULL),
+			    Node_nomatch,
+			    $2);
+		}
+	| exp LEX_IN NAME
+		{ $$ = node (variable($3,1), Node_in_array, $1); }
+	| exp RELOP exp
+		{
+		  /* $3 is the right-hand operand */
+		  if (do_lint && $3->type == Node_regex)
+			warning("Regular expression on right of comparison.");
+		  $$ = node ($1, $2, $3);
+		}
+	| exp '<' exp
+		{ $$ = node ($1, Node_less, $3); }
+	| exp '>' exp
+		{ $$ = node ($1, Node_greater, $3); }
+	| exp '?' exp ':' exp
+		{ $$ = node($1, Node_cond_exp, node($3, Node_if_branches, $5));}
+	| simp_exp
+		{ $$ = $1; }
+	| exp simp_exp %prec CONCAT_OP
+		{ $$ = node ($1, Node_concat, $2); }
+	;

+/*
+ * rexp --- the expression form used for the items of an unparenthesized
+ * print/printf list (see rexpression_list).  It mirrors `exp' but has no
+ * '<', '>' or "exp | getline" alternatives; presumably so those
+ * characters can introduce a redirection after the list.
+ */
+rexp
+	: variable ASSIGNOP
+		{ want_assign = 0; }
+	  rexp
+		/* $3 is the mid-rule action, so the right-hand side is $4 */
+		{ $$ = node ($1, $2, $4); }
+	| rexp LEX_AND rexp
+		{ $$ = node ($1, Node_and, $3); }
+	| rexp LEX_OR rexp
+		{ $$ = node ($1, Node_or, $3); }
+	| LEX_GETLINE opt_variable input_redir
+		{
+		  if (do_lint && ! io_allowed && $3 == NULL)
+			warning("non-redirected getline undefined inside BEGIN or END action");
+		  $$ = node ($2, Node_K_getline, $3);
+		}
+	| regexp
+		{ $$ = $1; }
+	| '!' regexp %prec UNARY
+		{
+		  /*
+		   * Build the same tree as the `exp' rule does for this
+		   * production: a field-spec node for field 0 as the left
+		   * operand, rather than a NULL left operand.
+		   * NOTE(review): confirm the evaluator did not rely on
+		   * the NULL form here.
+		   */
+		  $$ = node(node(make_number(0.0),
+				 Node_field_spec,
+				 (NODE *) NULL),
+			    Node_nomatch,
+			    $2);
+		}
+	| rexp MATCHOP rexp
+		{ $$ = node ($1, $2, mk_rexp($3)); }
+	| rexp LEX_IN NAME
+		{ $$ = node (variable($3,1), Node_in_array, $1); }
+	| rexp RELOP rexp
+		{ $$ = node ($1, $2, $3); }
+	| rexp '?' rexp ':' rexp
+		{ $$ = node($1, Node_cond_exp, node($3, Node_if_branches, $5));}
+	| simp_exp
+		{ $$ = $1; }
+	| rexp simp_exp %prec CONCAT_OP
+		{ $$ = node ($1, Node_concat, $2); }
+	;

+simp_exp

+ : non_post_simp_exp

+ /* Binary operators in order of decreasing precedence. */

+ | simp_exp '^' simp_exp

+ { $$ = node ($1, Node_exp, $3); }

+ | simp_exp '*' simp_exp

+ { $$ = node ($1, Node_times, $3); }

+ | simp_exp '/' simp_exp

+ { $$ = node ($1, Node_quotient, $3); }

+ | simp_exp '%' simp_exp

+ { $$ = node ($1, Node_mod, $3); }

+ | simp_exp '+' simp_exp

+ { $$ = node ($1, Node_plus, $3); }

+ | simp_exp '-' simp_exp

+ { $$ = node ($1, Node_minus, $3); }

+ | variable INCREMENT

+ { $$ = node ($1, Node_postincrement, (NODE *)NULL); }

+ | variable DECREMENT

+ { $$ = node ($1, Node_postdecrement, (NODE *)NULL); }

+ ;

+non_post_simp_exp

+ : '!' simp_exp %prec UNARY

+ { $$ = node ($2, Node_not,(NODE *) NULL); }

+ | '(' exp r_paren

+ { $$ = $2; }

+ | LEX_BUILTIN

+ '(' opt_expression_list r_paren

+ { $$ = snode ($3, Node_builtin, (int) $1); }

+ | LEX_LENGTH '(' opt_expression_list r_paren

+ { $$ = snode ($3, Node_builtin, (int) $1); }

+ | LEX_LENGTH

+ {

+ if (do_lint)

+ warning("call of `length' without parentheses is not portable");

+ $$ = snode ((NODE *)NULL, Node_builtin, (int) $1);

+ if (do_posix)

+ warning( "call of `length' without parentheses is deprecated by POSIX");

+ }

+ | FUNC_CALL '(' opt_expression_list r_paren

+ {

+ $$ = node ($3, Node_func_call, make_string($1, strlen($1)));

+ }

+ | variable

+ | INCREMENT variable

+ { $$ = node ($2, Node_preincrement, (NODE *)NULL); }

+ | DECREMENT variable

+ { $$ = node ($2, Node_predecrement, (NODE *)NULL); }

+ | YNUMBER

+ { $$ = $1; }

+ | YSTRING

+ { $$ = $1; }

+ | '-' simp_exp %prec UNARY

+ { if ($2->type == Node_val) {

+ $2->numbr = -(force_number($2));

+ $$ = $2;

+ } else

+ $$ = node ($2, Node_unary_minus, (NODE *)NULL);

+ }

+ | '+' simp_exp %prec UNARY

+ { $$ = $2; }

+ ;

+opt_variable

+ : /* empty */

+ { $$ = NULL; }

+ | variable

+ { $$ = $1; }

+ ;

+variable

+ : NAME

+ { $$ = variable($1,1); }

+ | NAME '[' expression_list ']'

+ {

+ if ($3->rnode == NULL) {

+ $$ = node (variable($1,1), Node_subscript, $3->lnode);

+ freenode($3);

+ } else

+ $$ = node (variable($1,1), Node_subscript, $3);

+ }

+ | '$' non_post_simp_exp

+ { $$ = node ($2, Node_field_spec, (NODE *)NULL); }

+ ;

+l_brace

+ : '{' opt_nls

+ ;

+r_brace

+ : '}' opt_nls { yyerrok; }

+ ;

+r_paren

+ : ')' { yyerrok; }

+ ;

+opt_semi

+ : /* empty */

+ | semi

+ ;

+semi

+ : ';' { yyerrok; want_assign = 0; }

+ ;

+comma : ',' opt_nls { yyerrok; }

+ ;

+%%

+struct token { /* one entry of tokentab[]: a reserved word or builtin name */

+ char *operator; /* text to match */

+ NODETYPE value; /* node type */

+ int class; /* lexical class (a LEX_* token code) */

+ unsigned flags; /* # of args. allowed and compatibility */

+# define ARGS 0xFF /* 0, 1, 2, 3 args allowed (any combination) */

+# define A(n) (1<<(n)) /* flag bit meaning "n arguments allowed" */

+# define VERSION 0xFF00 /* old awk is zero */

+# define NOT_OLD 0x0100 /* feature not in old awk */

+# define NOT_POSIX 0x0200 /* feature not in POSIX */

+# define GAWKX 0x0400 /* gawk extension */

+ NODE *(*ptr) (); /* function that implements this keyword; 0 in tokentab[] for non-builtins */

+};

+extern NODE

+ *do_exp(), *do_getline(), *do_index(), *do_length(),

+ *do_sqrt(), *do_log(), *do_sprintf(), *do_substr(),

+ *do_split(), *do_system(), *do_int(), *do_close(),

+ *do_atan2(), *do_sin(), *do_cos(), *do_rand(),

+ *do_srand(), *do_match(), *do_tolower(), *do_toupper(),

+ *do_sub(), *do_gsub(), *do_strftime(), *do_systime();

+/* Tokentab is sorted ascii ascending order, so it can be binary searched. */

+static struct token tokentab[] = {

+{"BEGIN", Node_illegal, LEX_BEGIN, 0, 0},

+{"END", Node_illegal, LEX_END, 0, 0},

+{"atan2", Node_builtin, LEX_BUILTIN, NOT_OLD|A(2), do_atan2},

+{"break", Node_K_break, LEX_BREAK, 0, 0},

+{"close", Node_builtin, LEX_BUILTIN, NOT_OLD|A(1), do_close},

+{"continue", Node_K_continue, LEX_CONTINUE, 0, 0},

+{"cos", Node_builtin, LEX_BUILTIN, NOT_OLD|A(1), do_cos},

+{"delete", Node_K_delete, LEX_DELETE, NOT_OLD, 0},

+{"do", Node_K_do, LEX_DO, NOT_OLD, 0},

+{"else", Node_illegal, LEX_ELSE, 0, 0},

+{"exit", Node_K_exit, LEX_EXIT, 0, 0},

+{"exp", Node_builtin, LEX_BUILTIN, A(1), do_exp},

+{"for", Node_K_for, LEX_FOR, 0, 0},

+{"func", Node_K_function, LEX_FUNCTION, NOT_POSIX|NOT_OLD, 0},

+{"function", Node_K_function, LEX_FUNCTION, NOT_OLD, 0},

+{"getline", Node_K_getline, LEX_GETLINE, NOT_OLD, 0},

+{"gsub", Node_builtin, LEX_BUILTIN, NOT_OLD|A(2)|A(3), do_gsub},

+{"if", Node_K_if, LEX_IF, 0, 0},

+{"in", Node_illegal, LEX_IN, 0, 0},

+{"index", Node_builtin, LEX_BUILTIN, A(2), do_index},

+{"int", Node_builtin, LEX_BUILTIN, A(1), do_int},

+{"length", Node_builtin, LEX_LENGTH, A(0)|A(1), do_length},

+{"log", Node_builtin, LEX_BUILTIN, A(1), do_log},

+{"match", Node_builtin, LEX_BUILTIN, NOT_OLD|A(2), do_match},

+{"next", Node_K_next, LEX_NEXT, 0, 0},

+{"print", Node_K_print, LEX_PRINT, 0, 0},

+{"printf", Node_K_printf, LEX_PRINTF, 0, 0},

+{"rand", Node_builtin, LEX_BUILTIN, NOT_OLD|A(0), do_rand},

+{"return", Node_K_return, LEX_RETURN, NOT_OLD, 0},

+{"sin", Node_builtin, LEX_BUILTIN, NOT_OLD|A(1), do_sin},

+{"split", Node_builtin, LEX_BUILTIN, A(2)|A(3), do_split},

+{"sprintf", Node_builtin, LEX_BUILTIN, 0, do_sprintf},

+{"sqrt", Node_builtin, LEX_BUILTIN, A(1), do_sqrt},

+{"srand", Node_builtin, LEX_BUILTIN, NOT_OLD|A(0)|A(1), do_srand},

+{"strftime", Node_builtin, LEX_BUILTIN, GAWKX|A(1)|A(2), do_strftime},

+{"sub", Node_builtin, LEX_BUILTIN, NOT_OLD|A(2)|A(3), do_sub},

+{"substr", Node_builtin, LEX_BUILTIN, A(2)|A(3), do_substr},

+{"system", Node_builtin, LEX_BUILTIN, NOT_OLD|A(1), do_system},

+{"systime", Node_builtin, LEX_BUILTIN, GAWKX|A(0), do_systime},

+{"tolower", Node_builtin, LEX_BUILTIN, NOT_OLD|A(1), do_tolower},

+{"toupper", Node_builtin, LEX_BUILTIN, NOT_OLD|A(1), do_toupper},

+{"while", Node_K_while, LEX_WHILE, 0, 0},

+};

+/*
+ * yyerror --- report a parse error and terminate.
+ *
+ * Called K&R-varargs style: the first argument is a printf-like format,
+ * the rest are its arguments.  The routine bumps errcount, prints the
+ * current source line through msg(), then hands err() a second line
+ * made of padding, a caret under the current lexeme, and the message.
+ * It ends with exit(2), so it does not return.
+ *
+ * If the lexeme is a newline, the caller's format is replaced by
+ * "unexpected newline".
+ */
+/* VARARGS0 */
+static void
+yyerror(va_alist)
+va_dcl
+{
+	va_list args;
+	char *mesg = NULL;
+	register char *bp, *cp;
+	char *scan;
+	char buf[120];
+	size_t mlen;
+
+	errcount++;
+	/* Find the current line in the input file */
+	if (lexptr) {
+		if (!thisline) {
+			cp = lexeme;
+			if (*cp == '\n') {
+				cp--;
+				mesg = "unexpected newline";
+			}
+			for ( ; cp != lexptr_begin && *cp != '\n'; --cp)
+				;
+			if (*cp == '\n')
+				cp++;
+			thisline = cp;
+		}
+		/* NL isn't guaranteed */
+		bp = lexeme;
+		while (bp < lexend && *bp && *bp != '\n')
+			bp++;
+	} else {
+		thisline = "(END OF FILE)";
+		bp = thisline + 13;
+	}
+	msg("%.*s", (int) (bp - thisline), thisline);
+
+	/*
+	 * Fetch the message before building the caret line, so that the
+	 * padding can be limited to what buf[] can hold.  The old code
+	 * reserved a fixed 24 bytes and then strcpy()'d messages that can
+	 * be twice that long, overrunning buf[] for errors far along a line.
+	 */
+	va_start(args);
+	if (mesg == NULL)
+		mesg = va_arg(args, char *);
+	mlen = strlen(mesg);
+
+	if (mlen > sizeof(buf) - 3) {
+		/* no room for "^ " plus the message: print it undecorated */
+		err("", mesg, args);
+	} else {
+		bp = buf;
+		/* leave room for '^', ' ', the message and its NUL */
+		cp = buf + (sizeof(buf) - 3 - mlen);
+		if (lexptr) {
+			scan = thisline;
+			/* keep tabs so the caret lines up with the source */
+			while (bp < cp && scan < lexeme)
+				if (*scan++ == '\t')
+					*bp++ = '\t';
+				else
+					*bp++ = ' ';
+			*bp++ = '^';
+			*bp++ = ' ';
+		}
+		strcpy(bp, mesg);
+		err("", buf, args);
+	}
+	va_end(args);
+	exit(2);
+}

+/*
+ * get_src_buf --- make the next piece of program text available to the
+ * lexer by setting lexptr/lexend (and keeping lexeme, lexptr_begin and
+ * thisline usable for error messages).
+ *
+ * Sources are taken in order from srcfiles[].  A CMDLINE entry is used
+ * in place; if it does not end in a newline, a second call supplies a
+ * copy of its last line with '\n' appended.  Any other entry is opened
+ * with pathopen() and read in chunks of optimal_bufsize(fd) bytes into
+ * a buffer that has SLOP spare bytes in front, where the tail of the
+ * current line is kept across refills.
+ *
+ * Returns a non-NULL pointer when text is available and NULL when there
+ * is no more source.  Fatal error if a file cannot be opened or read.
+ *
+ * NOTE(review): fd is not closed here when read() reports end of file.
+ */
+static char *
+get_src_buf()
+{
+	static int samefile = 0;	/* set while a source file is open */
+	static int nextfile = 0;	/* index of current srcfiles[] entry */
+	static char *buf = NULL;
+	static int fd;
+	int n;
+	register char *scan;
+	static int len = 0;
+	static int did_newline = 0;
+#	define	SLOP	128	/* enough space to hold most source lines */
+
+	if (nextfile > numfiles)
+		return NULL;
+
+	if (srcfiles[nextfile].stype == CMDLINE) {
+		if (len == 0) {
+			/* first visit: lex the string where it sits */
+			len = strlen(srcfiles[nextfile].val);
+			sourceline = 1;
+			lexptr = lexptr_begin = srcfiles[nextfile].val;
+			lexend = lexptr + len;
+		} else if (!did_newline && *(lexptr-1) != '\n') {
+			/*
+			 * The following goop is to ensure that the source
+			 * ends with a newline and that the entire current
+			 * line is available for error messages.
+			 */
+			int offset;
+
+			did_newline = 1;
+			offset = lexptr - lexeme;
+			for (scan = lexeme; scan > lexptr_begin; scan--)
+				if (*scan == '\n') {
+					scan++;
+					break;
+				}
+			len = lexptr - scan;
+			emalloc(buf, char *, len+1, "get_src_buf");
+			memcpy(buf, scan, len);
+			thisline = buf;
+			lexptr = buf + len;
+			*lexptr = '\n';
+			lexeme = lexptr - offset;
+			lexptr_begin = buf;
+			lexend = lexptr + 1;
+		} else {
+			/* this command-line source is used up */
+			len = 0;
+			lexeme = lexptr = lexptr_begin = NULL;
+		}
+		if (lexptr == NULL && ++nextfile <= numfiles)
+			return get_src_buf();
+		return lexptr;
+	}
+	if (!samefile) {
+		source = srcfiles[nextfile].val;
+		if (source == NULL) {
+			if (buf) {
+				free(buf);
+				buf = NULL;
+			}
+			len = 0;
+			return lexeme = lexptr = lexptr_begin = NULL;
+		}
+		fd = pathopen(source);
+		if (fd == -1)
+			fatal("can't open source file \"%s\" for reading (%s)",
+				source, strerror(errno));
+		len = optimal_bufsize(fd);
+		if (buf)
+			free(buf);
+		emalloc(buf, char *, len + SLOP, "get_src_buf");
+		lexptr_begin = buf + SLOP;
+		samefile = 1;
+		sourceline = 1;
+	} else {
+		/*
+		 * Here, we retain the current source line (up to length SLOP)
+		 * in the beginning of the buffer that was overallocated above
+		 */
+		int offset;
+		int linelen;
+
+		offset = lexptr - lexeme;
+		for (scan = lexeme; scan > lexptr_begin; scan--)
+			if (*scan == '\n') {
+				scan++;
+				break;
+			}
+		linelen = lexptr - scan;
+		if (linelen > SLOP)
+			linelen = SLOP;
+		/*
+		 * Only linelen bytes are retained, so lexeme cannot be
+		 * further back than that; without this clamp it could be
+		 * set to an address before buf.
+		 */
+		if (offset > linelen)
+			offset = linelen;
+		thisline = buf + SLOP - linelen;
+		/*
+		 * Keep the LAST linelen bytes before lexptr.  Copying from
+		 * `scan' kept the start of the line instead whenever the
+		 * line was longer than SLOP.
+		 * NOTE(review): source and destination can overlap when the
+		 * line began inside the retained area; that was already so
+		 * with the original copy from `scan'.
+		 */
+		memcpy(thisline, lexptr - linelen, linelen);
+		lexeme = buf + SLOP - offset;
+		lexptr_begin = thisline;
+	}
+	n = read(fd, buf + SLOP, len);
+	if (n == -1)
+		fatal("can't read sourcefile \"%s\" (%s)",
+			source, strerror(errno));
+	if (n == 0) {
+		/* end of this file: move on to the next source */
+		samefile = 0;
+		nextfile++;
+		len = 0;
+		return get_src_buf();
+	}
+	lexptr = buf + SLOP;
+	lexend = lexptr + n;
+	return buf;
+}

+#define tokadd(x) (*token++ = (x), token == tokend ? tokexpand() : token)

+char * /* tokexpand --- grow the token buffer; returns the new value of `token' */

+tokexpand()

+ static int toksize = 60; /* doubled before every (re)allocation, so the first buffer is 120 bytes */

+ int tokoffset;

+ tokoffset = token - tokstart; /* remember the write position; the buffer may move */

+ toksize *= 2;

+ if (tokstart)

+ erealloc(tokstart, char *, toksize, "tokexpand");

+ else

+ emalloc(tokstart, char *, toksize, "tokexpand"); /* first call: nothing allocated yet */

+ tokend = tokstart + toksize; /* tokadd() calls tokexpand() when token reaches this */

+ token = tokstart + tokoffset; /* restore the write position inside the new buffer */

+ return token;

+#if DEBUG /* function form of nextc(); the macro form below has the same logic */

+char

+nextc() {

+ if (lexptr && lexptr < lexend) /* characters remain in the current buffer */

+ return *lexptr++;

+ else if (get_src_buf()) /* buffer exhausted: try to get more source text */

+ return *lexptr++;

+ else

+ return '\0'; /* no more source text */

+#else /* ! DEBUG */

+#define nextc() ((lexptr && lexptr < lexend) ? \

+ *lexptr++ : \

+ (get_src_buf() ? *lexptr++ : '\0') \

+ )

+#endif /* DEBUG */

+#define pushback() (lexptr && lexptr > lexptr_begin ? lexptr-- : lexptr)

+/*

+ * Read the input and turn it into tokens.

+ */

+static int

+yylex()

+ register int c;

+ int seen_e = 0; /* These are for numbers */

+ int seen_point = 0;

+ int esc_seen; /* for literal strings */

+ int low, mid, high;

+ static int did_newline = 0;

+ char *tokkey;

+ if (!nextc())

+ return 0;

+ pushback();

+ lexeme = lexptr;

+ thisline = NULL;

+ if (want_regexp) {

+ int in_brack = 0;

+ want_regexp = 0;

+ token = tokstart;

+ while ((c = nextc()) != 0) {

+ switch (c) {

+ case '[':

+ in_brack = 1;

+ break;

+ case ']':

+ in_brack = 0;

+ break;

+ case '\\':

+ if ((c = nextc()) == '\0') {

+ yyerror("unterminated regexp ends with \\ at end of file");

+ } else if (c == '\n') {

+ sourceline++;

+ continue;

+ } else

+ tokadd('\\');

+ break;

+ case '/': /* end of the regexp */

+ if (in_brack)

+ break;

+ pushback();

+ tokadd('\0');

+ yylval.sval = tokstart;

+ return REGEXP;

+ case '\n':

+ pushback();

+ yyerror("unterminated regexp");

+ case '\0':

+ yyerror("unterminated regexp at end of file");

+ }

+ tokadd(c);

+ }

+retry:

+ while ((c = nextc()) == ' ' || c == '\t')

+ ;

+ lexeme = lexptr ? lexptr - 1 : lexptr;

+ thisline = NULL;

+ token = tokstart;

+ yylval.nodetypeval = Node_illegal;

+ switch (c) {

+ case 0:

+ return 0;

+ case '\n':

+ sourceline++;

+ return NEWLINE;

+ case '#': /* it's a comment */

+ while ((c = nextc()) != '\n') {

+ if (c == '\0')

+ return 0;

+ }

+ sourceline++;

+ return NEWLINE;

+ case '\\':

+#ifdef RELAXED_CONTINUATION

+ if (!do_unix) { /* strip trailing white-space and/or comment */

+ while ((c = nextc()) == ' ' || c == '\t') continue;

+ if (c == '#')

+ while ((c = nextc()) != '\n') if (!c) break;

+ pushback();

+ }

+#endif /*RELAXED_CONTINUATION*/

+ if (nextc() == '\n') {

+ sourceline++;

+ goto retry;

+ } else

+ yyerror("inappropriate use of backslash");

+ break;

+ case '$':

+ want_assign = 1;

+ return '$';

+ case ')':

+ case ']':

+ case '(':

+ case '[':

+ case ';':

+ case ':':

+ case '?':

+ case '{':

+ case ',':

+ return c;

+ case '*':

+ if ((c = nextc()) == '=') {

+ yylval.nodetypeval = Node_assign_times;

+ return ASSIGNOP;

+ } else if (do_posix) {

+ pushback();

+ return '*';

+ } else if (c == '*') {

+ /* make ** and **= aliases for ^ and ^= */

+ static int did_warn_op = 0, did_warn_assgn = 0;

+ if (nextc() == '=') {

+ if (do_lint && ! did_warn_assgn) {

+ did_warn_assgn = 1;

+ warning("**= is not allowed by POSIX");

+ }

+ yylval.nodetypeval = Node_assign_exp;

+ return ASSIGNOP;

+ } else {

+ pushback();

+ if (do_lint && ! did_warn_op) {

+ did_warn_op = 1;

+ warning("** is not allowed by POSIX");

+ }

+ return '^';

+ }

+ pushback();

+ return '*';

+ case '/':

+ if (want_assign) {

+ if (nextc() == '=') {

+ yylval.nodetypeval = Node_assign_quotient;

+ return ASSIGNOP;

+ }

+ pushback();

+ }

+ return '/';

+ case '%':

+ if (nextc() == '=') {

+ yylval.nodetypeval = Node_assign_mod;

+ return ASSIGNOP;

+ }

+ pushback();

+ return '%';

+ case '^':

+ {

+ static int did_warn_op = 0, did_warn_assgn = 0;

+ if (nextc() == '=') {

+ if (do_lint && ! did_warn_assgn) {

+ did_warn_assgn = 1;

+ warning("operator `^=' is not supported in old awk");

+ }

+ yylval.nodetypeval = Node_assign_exp;

+ return ASSIGNOP;

+ }

+ pushback();

+ if (do_lint && ! did_warn_op) {

+ did_warn_op = 1;

+ warning("operator `^' is not supported in old awk");

+ }

+ return '^';

+ }

+ case '+':

+ if ((c = nextc()) == '=') {

+ yylval.nodetypeval = Node_assign_plus;

+ return ASSIGNOP;

+ }

+ if (c == '+')

+ return INCREMENT;

+ pushback();

+ return '+';

+ case '!':

+ if ((c = nextc()) == '=') {

+ yylval.nodetypeval = Node_notequal;

+ return RELOP;

+ }

+ if (c == '~') {

+ yylval.nodetypeval = Node_nomatch;

+ want_assign = 0;

+ return MATCHOP;

+ }

+ pushback();

+ return '!';

+ case '<':

+ if (nextc() == '=') {

+ yylval.nodetypeval = Node_leq;

+ return RELOP;

+ }

+ yylval.nodetypeval = Node_less;

+ pushback();

+ return '<';

+ case '=':

+ if (nextc() == '=') {

+ yylval.nodetypeval = Node_equal;

+ return RELOP;

+ }

+ yylval.nodetypeval = Node_assign;

+ pushback();

+ return ASSIGNOP;

+ case '>':

+ if ((c = nextc()) == '=') {

+ yylval.nodetypeval = Node_geq;

+ return RELOP;

+ } else if (c == '>') {

+ yylval.nodetypeval = Node_redirect_append;

+ return APPEND_OP;

+ }

+ yylval.nodetypeval = Node_greater;

+ pushback();

+ return '>';

+ case '~':

+ yylval.nodetypeval = Node_match;

+ want_assign = 0;

+ return MATCHOP;

+ case '}':

+ /*

+ * Added did newline stuff. Easier than

+ * hacking the grammar

+ */

+ if (did_newline) {

+ did_newline = 0;

+ return c;

+ }

+ did_newline++;

+ --lexptr; /* pick up } next time */

+ return NEWLINE;

+ case '"':

+ esc_seen = 0;

+ while ((c = nextc()) != '"') {

+ if (c == '\n') {

+ pushback();

+ yyerror("unterminated string");

+ }

+ if (c == '\\') {

+ c = nextc();

+ if (c == '\n') {

+ sourceline++;

+ continue;

+ }

+ esc_seen = 1;

+ tokadd('\\');

+ }

+ if (c == '\0') {

+ pushback();

+ yyerror("unterminated string");

+ }

+ tokadd(c);

+ }

+ yylval.nodeval = make_str_node(tokstart,

+ token - tokstart, esc_seen ? SCAN : 0);

+ yylval.nodeval->flags |= PERM;

+ return YSTRING;

+ case '-':

+ if ((c = nextc()) == '=') {

+ yylval.nodetypeval = Node_assign_minus;

+ return ASSIGNOP;

+ }

+ if (c == '-')

+ return DECREMENT;

+ pushback();

+ return '-';

+ case '.':

+ c = nextc();

+ pushback();

+ if (!isdigit(c))

+ return '.';

+ else

+ c = '.'; /* FALL THROUGH */

+ case '0':

+ case '1':

+ case '2':

+ case '3':

+ case '4':

+ case '5':

+ case '6':

+ case '7':

+ case '8':

+ case '9':

+ /* It's a number */

+ for (;;) {

+ int gotnumber = 0;

+ tokadd(c);

+ switch (c) {

+ case '.':

+ if (seen_point) {

+ gotnumber++;

+ break;

+ }

+ ++seen_point;

+ break;

+ case 'e':

+ case 'E':

+ if (seen_e) {

+ gotnumber++;

+ break;

+ }

+ ++seen_e;

+ if ((c = nextc()) == '-' || c == '+')

+ tokadd(c);

+ else

+ pushback();

+ break;

+ case '0':

+ case '1':

+ case '2':

+ case '3':

+ case '4':

+ case '5':

+ case '6':

+ case '7':

+ case '8':

+ case '9':

+ break;

+ default:

+ gotnumber++;

+ }

+ if (gotnumber)

+ break;

+ c = nextc();

+ }

+ pushback();

+ yylval.nodeval = make_number(atof(tokstart));

+ yylval.nodeval->flags |= PERM;

+ return YNUMBER;

+ case '&':

+ if ((c = nextc()) == '&') {

+ yylval.nodetypeval = Node_and;

+ for (;;) {

+ c = nextc();

+ if (c == '\0')

+ break;

+ if (c == '#') {

+ while ((c = nextc()) != '\n' && c != '\0')

+ ;

+ if (c == '\0')

+ break;

+ }

+ if (c == '\n')

+ sourceline++;

+ if (! isspace(c)) {

+ pushback();

+ break;

+ }

+ want_assign = 0;

+ return LEX_AND;

+ }

+ pushback();

+ return '&';

+ case '|':

+ if ((c = nextc()) == '|') {

+ yylval.nodetypeval = Node_or;

+ for (;;) {

+ c = nextc();

+ if (c == '\0')

+ break;

+ if (c == '#') {

+ while ((c = nextc()) != '\n' && c != '\0')

+ ;

+ if (c == '\0')

+ break;

+ }

+ if (c == '\n')

+ sourceline++;

+ if (! isspace(c)) {

+ pushback();

+ break;

+ }

+ want_assign = 0;

+ return LEX_OR;

+ }

+ pushback();

+ return '|';

+ }

+ if (c != '_' && ! isalpha(c))

+ yyerror("Invalid char '%c' in expression\n", c);

+ /* it's some type of name-type-thing. Find its length */

+ token = tokstart;

+ while (is_identchar(c)) {

+ tokadd(c);

+ c = nextc();

+ }

+ tokadd('\0');

+ emalloc(tokkey, char *, token - tokstart, "yylex");

+ memcpy(tokkey, tokstart, token - tokstart);

+ pushback();

+ /* See if it is a special token. */

+ low = 0;

+ high = (sizeof (tokentab) / sizeof (tokentab[0])) - 1;

+ while (low <= high) {

+ int i/* , c */;

+ mid = (low + high) / 2;

+ c = *tokstart - tokentab[mid].operator[0];

+ i = c ? c : strcmp (tokstart, tokentab[mid].operator);

+ if (i < 0) { /* token < mid */

+ high = mid - 1;

+ } else if (i > 0) { /* token > mid */

+ low = mid + 1;

+ } else {

+ if (do_lint) {

+ if (tokentab[mid].flags & GAWKX)

+ warning("%s() is a gawk extension",

+ tokentab[mid].operator);

+ if (tokentab[mid].flags & NOT_POSIX)

+ warning("POSIX does not allow %s",

+ tokentab[mid].operator);

+ if (tokentab[mid].flags & NOT_OLD)

+ warning("%s is not supported in old awk",

+ tokentab[mid].operator);

+ }

+ if ((do_unix && (tokentab[mid].flags & GAWKX))

+ || (do_posix && (tokentab[mid].flags & NOT_POSIX)))

+ break;

+ if (tokentab[mid].class == LEX_BUILTIN

+ || tokentab[mid].class == LEX_LENGTH

+ )

+ yylval.lval = mid;

+ else

+ yylval.nodetypeval = tokentab[mid].value;

+ return tokentab[mid].class;

+ }

+ yylval.sval = tokkey;

+ if (*lexptr == '(')

+ return FUNC_CALL;

+ else {

+ want_assign = 1;

+ return NAME;

+ }

+/*
+ * node_common: allocate a fresh node of type op, flag it MALLOC, and
+ * stamp it with the current source file and line number.
+ */
+static NODE *
+node_common(op)
+NODETYPE op;
+{
+	register NODE *n;
+	int line = sourceline;
+
+	/* when the lookahead is a newline, sourceline is already 1 too high */
+	if (lexeme && *lexeme == '\n')
+		line--;
+
+	getnode(n);
+	n->type = op;
+	n->flags = MALLOC;
+	n->source_line = line;
+	n->source_file = source;
+	return n;
+}

+/*
+ * node: build a node of type op whose lnode and rnode are the two
+ * subtrees handed in.
+ */
+NODE *
+node(left, op, right)
+NODE *left, *right;
+NODETYPE op;
+{
+	register NODE *n = node_common(op);
+
+	n->lnode = left;
+	n->rnode = right;
+	return n;
+}

+/*

+ * This allocates a node with defined subnode and proc for builtin functions

+ * Checks for arg. count and supplies defaults where possible.

+ *

+ * subn is the argument expression list (chained through rnode), op is the

+ * type given to the new node, and idx selects the builtin's entry in

+ * tokentab[].  A disallowed argument count is reported through fatal().

+ */

+static NODE *

+snode(subn, op, idx)

+NODETYPE op;

+int idx;

+NODE *subn;

+ register NODE *r;

+ register NODE *n;

+ int nexp = 0;

+ int args_allowed;

+ r = node_common(op);

+ /* traverse expression list to see how many args. given */

+ for (n= subn; n; n= n->rnode) {

+ nexp++;

+ if (nexp > 3) /* counting stops at 4 */

+ break;

+ }

+ /* check against how many args. are allowed for this builtin */

+ args_allowed = tokentab[idx].flags & ARGS;

+ if (args_allowed && !(args_allowed & A(nexp)))

+ fatal("%s() cannot have %d argument%c",

+ tokentab[idx].operator, nexp, nexp == 1 ? ' ' : 's');

+ r->proc = tokentab[idx].ptr;

+ /* special case processing for a few builtins */

+ if (nexp == 0 && r->proc == do_length) { /* no args: use field 0 as the argument */

+ subn = node(node(make_number(0.0),Node_field_spec,(NODE *)NULL),

+ Node_expression_list,

+ (NODE *) NULL);

+ } else if (r->proc == do_match) { /* second arg becomes a regex node */

+ if (subn->rnode->lnode->type != Node_regex)

+ subn->rnode->lnode = mk_rexp(subn->rnode->lnode);

+ } else if (r->proc == do_sub || r->proc == do_gsub) { /* first arg becomes a regex node */

+ if (subn->lnode->type != Node_regex)

+ subn->lnode = mk_rexp(subn->lnode);

+ if (nexp == 2) /* target omitted: append field 0 as third arg */

+ append_right(subn, node(node(make_number(0.0),

+ Node_field_spec,

+ (NODE *) NULL),

+ Node_expression_list,

+ (NODE *) NULL));

+ else if (do_lint && subn->rnode->rnode->lnode->type == Node_val)

+ warning("string literal as last arg of substitute");

+ } else if (r->proc == do_split) {

+ if (nexp == 2) /* separator omitted: append FS_node as third arg */

+ append_right(subn,

+ node(FS_node, Node_expression_list, (NODE *) NULL));

+ n = subn->rnode->rnode->lnode;

+ if (n->type != Node_regex)

+ subn->rnode->rnode->lnode = mk_rexp(n);

+ if (nexp == 2) /* mark the regex as coming from the default separator */

+ subn->rnode->rnode->lnode->re_flags |= FS_DFLT;

+ }

+ r->subnode = subn;

+ return r;

+/*
+ * mkrangenode: allocate a Node_line_range node whose condpair is cpair.
+ * The trigger word is cleared explicitly here, so nobody has to assume
+ * that 'node( foo, Node_line_range, 0)' would initialize 'triggered'
+ * properly.  Otherwise like node().
+ */
+static NODE *
+mkrangenode(cpair)
+NODE *cpair;
+{
+	register NODE *range;
+
+	getnode(range);
+	range->type = Node_line_range;
+	range->condpair = cpair;
+	range->triggered = 0;
+	return range;
+}

+/*
+ * make_for_loop: bundle the three clauses of a for statement into a newly
+ * allocated FOR_LOOP_HEADER and return a Node_illegal node that points
+ * at it.
+ */
+static NODE *
+make_for_loop(init, cond, incr)
+NODE *init, *cond, *incr;
+{
+	register FOR_LOOP_HEADER *hdr;
+	NODE *carrier;
+
+	emalloc(hdr, FOR_LOOP_HEADER *, sizeof(FOR_LOOP_HEADER), "make_for_loop");
+	hdr->init = init;
+	hdr->cond = cond;
+	hdr->incr = incr;
+
+	getnode(carrier);
+	carrier->type = Node_illegal;
+	carrier->sub.nodep.r.hd = hdr;
+	return carrier;
+}

+/*
+ * install: put name at the head of its hash chain in variables[], bound
+ * to value, whether or not the name is already present.  Callers that
+ * care about redefinition must check first.  Returns value.
+ */
+NODE *
+install(name, value)
+char *name;
+NODE *value;
+{
+	register NODE *entry;
+	register int len, bucket;
+
+	len = strlen(name);
+	bucket = hash(name, len);
+
+	getnode(entry);
+	entry->type = Node_hashnode;
+	entry->hlength = len;
+	entry->hname = name;
+	entry->hvalue = value;
+
+	/* link in front, so the newest definition is found first */
+	entry->hnext = variables[bucket];
+	variables[bucket] = entry;
+
+	value->vname = name;
+	return value;
+}

+/*
+ * lookup: return the value of the most recently installed hash node for
+ * name, or NULL when the name is not in the table.
+ */
+NODE *
+lookup(name)
+char *name;
+{
+	register NODE *hp;
+	register int len;
+
+	len = strlen(name);
+	for (hp = variables[hash(name, len)]; hp != NULL; hp = hp->hnext) {
+		if (hp->hlength == len && STREQN(hp->hname, name, len))
+			return hp->hvalue;
+	}
+	return NULL;
+}

+/*
+ * append_right: hang new off the end of the rnode chain that starts at
+ * list and return list.  Walking the chain on every call is quadratic,
+ * so the head and tail of the last list appended to are remembered and
+ * reused when the same list is extended again.
+ */
+static NODE *
+append_right(list, new)
+NODE *list, *new;
+{
+	register NODE *tail;
+	static NODE *savefront = NULL, *savetail = NULL;
+
+	if (savefront == list) {
+		/* same list as last time: its tail is already known */
+		tail = savetail;
+	} else {
+		savefront = list;
+		for (tail = list; tail->rnode != NULL; tail = tail->rnode)
+			continue;
+	}
+	tail->rnode = new;
+	savetail = new;
+	return list;
+}

+/*
+ * func_install: remove the function's parameters and its own name node
+ * from the symbol table, then install the name bound to a Node_func
+ * node built from params and def.  It is fatal if the name is still
+ * defined after the removal.
+ */
+static void
+func_install(params, def)
+NODE *params;
+NODE *def;
+{
+	/* parameters first (their names are freed), then the function name */
+	pop_params(params->rnode);
+	pop_var(params, 0);
+
+	if (lookup(params->param) != NULL) {
+		fatal("function name `%s' previously defined", params->param);
+		return;
+	}
+	(void) install(params->param, node(params, Node_func, def));
+}

+/*
+ * pop_var: unlink and release the first hash node whose name equals
+ * np->param.  When freeit is nonzero the name string is freed as well.
+ * Does nothing if no such node exists.
+ */
+static void
+pop_var(np, freeit)
+NODE *np;
+int freeit;
+{
+	register NODE *hp, **link;
+	register int len;
+	char *name;
+
+	name = np->param;
+	len = strlen(name);
+
+	/* link always addresses the pointer that leads to hp */
+	for (link = &(variables[hash(name, len)]);
+	     (hp = *link) != NULL;
+	     link = &(hp->hnext)) {
+		if (len != hp->hlength || !STREQN(hp->hname, name, len))
+			continue;
+		*link = hp->hnext;
+		freenode(hp);
+		if (freeit)
+			free(np->param);
+		return;
+	}
+}

+/*
+ * pop_params: pop every parameter on the rnode-linked list params out of
+ * the symbol table, freeing each parameter name.
+ */
+static void
+pop_params(params)
+NODE *params;
+{
+	register NODE *np = params;
+
+	while (np != NULL) {
+		pop_var(np, 1);
+		np = np->rnode;
+	}
+}

+/*
+ * make_param: create a Node_param_list node for name, number it with the
+ * next value of param_counter, and install it in the symbol table.
+ */
+static NODE *
+make_param(name)
+char *name;
+{
+	NODE *parm;
+
+	getnode(parm);
+	parm->type = Node_param_list;
+	parm->rnode = NULL;
+	parm->param = name;
+	parm->param_cnt = param_counter++;
+
+	return install(name, parm);
+}

+/*
+ * variable: return the symbol table value for name, installing a new
+ * Node_var bound to Nnull_string if the name is not there yet.  The
+ * first reference to ENVIRON triggers load_environ().  When the name was
+ * already present and can_free is nonzero, name is freed.
+ */
+NODE *
+variable(name, can_free)
+char *name;
+int can_free;
+{
+	register NODE *found;
+	static int env_loaded = 0;
+
+	if (!env_loaded && STREQ(name, "ENVIRON")) {
+		load_environ();
+		env_loaded = 1;
+	}
+
+	found = lookup(name);
+	if (found == NULL)
+		return install(name, node(Nnull_string, Node_var, (NODE *) NULL));
+
+	if (can_free)
+		free(name);
+	return found;
+}

+/*
+ * mk_rexp: wrap exp in a new Node_regex node, unless exp already is one,
+ * in which case it is returned unchanged.
+ */
+static NODE *
+mk_rexp(exp)
+NODE *exp;
+{
+	NODE *re;
+
+	if (exp->type == Node_regex)
+		return exp;
+
+	getnode(re);
+	re->type = Node_regex;
+	re->re_exp = exp;
+	re->re_text = NULL;
+	re->re_reg = NULL;
+	re->re_flags = 0;
+	re->re_cnt = 1;
+	return re;
+}

diff --git a/gnu/usr.bin/awk/builtin.c b/gnu/usr.bin/awk/builtin.c
new file mode 100644
index 000000000000..9d5e3b302fde
--- /dev/null
+++ b/gnu/usr.bin/awk/builtin.c

@@ -0,0 +1,1133 @@

+/*

+ * builtin.c - Builtin functions and various utility procedures

+ */

+/*

+ *

+ * This file is part of GAWK, the GNU implementation of the

+ * AWK Programming Language.

+ *

+ * GAWK is free software; you can redistribute it and/or modify

+ * it under the terms of the GNU General Public License as published by

+ * the Free Software Foundation; either version 2 of the License, or

+ * (at your option) any later version.

+ *

+ * GAWK is distributed in the hope that it will be useful,

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

+ * GNU General Public License for more details.

+ *

+ * You should have received a copy of the GNU General Public License

+ * along with GAWK; see the file COPYING. If not, write to

+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

+ */

+#include "awk.h"

+#ifndef SRANDOM_PROTO

+extern void srandom P((int seed));

+#endif

+#ifndef linux

+extern char *initstate P((unsigned seed, char *state, int n));

+extern char *setstate P((char *state));

+extern long random P((void));

+#endif

+extern NODE **fields_arr;

+extern int output_is_tty;

+static NODE *sub_common P((NODE *tree, int global));

+#ifdef GFMT_WORKAROUND

+char *gfmt P((double g, int prec, char *buf));

+#endif

+#ifdef _CRAY

+/* Work around a problem in conversion of doubles to exact integers. */

+#include <float.h>

+#define Floor(n) floor((n) * (1.0 + DBL_EPSILON))

+#define Ceil(n) ceil((n) * (1.0 + DBL_EPSILON))

+/* Force the standard C compiler to use the library math functions. */

+extern double exp(double);

+double (*Exp)() = exp;

+#define exp(x) (*Exp)(x)

+extern double log(double);

+double (*Log)() = log;

+#define log(x) (*Log)(x)

+#else

+#define Floor(n) floor(n)

+#define Ceil(n) ceil(n)

+#endif

+/*
+ * efwrite: fwrite() count items of size bytes from ptr to fp and treat a
+ * short write as fatal.  When flush is set, the stream is also flushed
+ * if it is stdout on a tty or if the redirection carries RED_NOBUF; an
+ * error left on the stream after that flush is fatal too.  from names
+ * the calling statement in the error message.
+ */
+static void
+efwrite(ptr, size, count, fp, from, rp, flush)
+void *ptr;
+unsigned size, count;
+FILE *fp;
+char *from;
+struct redirect *rp;
+int flush;
+{
+	int failed;
+
+	errno = 0;
+	failed = (fwrite(ptr, size, count, fp) != count);
+
+	if (!failed && flush
+	    && ((fp == stdout && output_is_tty)
+		|| (rp && (rp->flag & RED_NOBUF)))) {
+		fflush(fp);
+		failed = (ferror(fp) != 0);
+	}
+
+	if (failed)
+		fatal("%s to \"%s\" failed (%s)", from,
+			rp ? rp->value : "standard output",
+			errno ? strerror(errno) : "reason unknown");
+}

+/* Builtin functions */

+/*
+ * do_exp: builtin exp(x).  A warning is issued when the C library
+ * reports ERANGE for the argument.
+ */
+NODE *
+do_exp(tree)
+NODE *tree;
+{
+	NODE *argnode;
+	double x, result;
+#ifndef exp
+	double exp P((double));
+#endif
+
+	argnode = tree_eval(tree->lnode);
+	x = force_number(argnode);
+	free_temp(argnode);
+
+	errno = 0;
+	result = exp(x);
+	if (errno == ERANGE)
+		warning("exp argument %g is out of range", x);
+	return tmp_number((AWKNUM) result);
+}

+/*
+ * do_index: builtin index(s, t).  Returns the 1-based position of the
+ * first occurrence of t in s, or 0 if there is none.  When IGNORECASE is
+ * set, characters are compared through casetable[] and strncasecmp().
+ */
+NODE *
+do_index(tree)
+NODE *tree;
+{
+	NODE *s1, *s2;
+	register char *p1, *p2;
+	register int l1, l2;
+	long ret;
+
+	s1 = tree_eval(tree->lnode);
+	s2 = tree_eval(tree->rnode->lnode);
+	force_string(s1);
+	force_string(s2);
+	p1 = s1->stptr;
+	p2 = s2->stptr;
+	l1 = s1->stlen;
+	l2 = s2->stlen;
+	ret = 0;
+	if (IGNORECASE) {
+		while (l1) {
+			if (l2 > l1)	/* rest of s is shorter than t */
+				break;
+			/*
+			 * Subscript casetable[] through unsigned char: a
+			 * plain char may be signed, and a byte with the
+			 * high bit set would then index before the table.
+			 */
+			if (casetable[(unsigned char) *p1] == casetable[(unsigned char) *p2]
+			    && (l2 == 1 || strncasecmp(p1, p2, l2) == 0)) {
+				ret = 1 + s1->stlen - l1;
+				break;
+			}
+			l1--;
+			p1++;
+		}
+	} else {
+		while (l1) {
+			if (l2 > l1)	/* rest of s is shorter than t */
+				break;
+			/* cheap first-character test before the full compare */
+			if (*p1 == *p2
+			    && (l2 == 1 || STREQN(p1, p2, l2))) {
+				ret = 1 + s1->stlen - l1;
+				break;
+			}
+			l1--;
+			p1++;
+		}
+	}
+	free_temp(s1);
+	free_temp(s2);
+	return tmp_number((AWKNUM) ret);
+}

+/*
+ * do_int: builtin int(x).  Truncates toward zero: Floor() for values
+ * that are >= 0, Ceil() for negative ones.
+ */
+NODE *
+do_int(tree)
+NODE *tree;
+{
+	NODE *argnode;
+	double floor P((double));
+	double ceil P((double));
+	double val;
+
+	argnode = tree_eval(tree->lnode);
+	val = force_number(argnode);
+	val = (val >= 0) ? Floor(val) : Ceil(val);
+	free_temp(argnode);
+	return tmp_number((AWKNUM) val);
+}

+/* do_length: builtin length(s); the length of s taken as a string. */
+NODE *
+do_length(tree)
+NODE *tree;
+{
+	NODE *val;
+	int n;
+
+	val = tree_eval(tree->lnode);
+	n = force_string(val)->stlen;
+	free_temp(val);
+	return tmp_number((AWKNUM) n);
+}

+/*
+ * do_log: builtin log(x).  A negative argument draws a warning; log()
+ * is still called and its result returned.
+ */
+NODE *
+do_log(tree)
+NODE *tree;
+{
+	NODE *argnode;
+#ifndef log
+	double log P((double));
+#endif
+	double x, result;
+
+	argnode = tree_eval(tree->lnode);
+	x = (double) force_number(argnode);
+	if (x < 0.0)
+		warning("log called with negative argument %g", x);
+	result = log(x);
+	free_temp(argnode);
+	return tmp_number((AWKNUM) result);
+}

+/* %e and %f formats are not properly implemented. Someone should fix them */

+/* Actually, this whole thing should be reimplemented.

+ *

+ * do_sprintf: evaluates tree->lnode as the format string and consumes the

+ * expression list at tree->rnode as its arguments.  Output accumulates in

+ * the malloc'ed buffer obuf, which is handed to make_str_node() with

+ * ALREADY_MALLOCED; the resulting node is flagged TEMP and returned.

+ * Running out of arguments for a conversion is fatal.

+ */

+NODE *

+do_sprintf(tree)

+NODE *tree;

+#define bchunk(s,l) if(l) {\

+ while((l)>ofre) {\

+ erealloc(obuf, char *, osiz*2, "do_sprintf");\

+ ofre+=osiz;\

+ osiz*=2;\

+ }\

+ memcpy(obuf+olen,s,(l));\

+ olen+=(l);\

+ ofre-=(l);\

+ }

+ /* Is there space for something L big in the buffer?

+ *

+ * NOTE(review): unlike bchunk, this doubles the buffer at most once, and

+ * the %g/%f/%e cases then sprintf() straight into obuf; looks like a very

+ * long conversion could exceed the space reserved -- confirm.

+ */

+#define chksize(l) if((l)>ofre) {\

+ erealloc(obuf, char *, osiz*2, "do_sprintf");\

+ ofre+=osiz;\

+ osiz*=2;\

+ }

+ /*

+ * Get the next arg to be formatted. If we've run out of args,

+ * return "" (Null string)

+ */

+#define parse_next_arg() {\

+ if(!carg) { toofew = 1; break; }\

+ else {\

+ arg=tree_eval(carg->lnode);\

+ carg=carg->rnode;\

+ }\

+ }

+ NODE *r;

+ int toofew = 0;

+ char *obuf;

+ int osiz, ofre, olen; /* buffer size, free bytes, bytes used */

+ static char chbuf[] = "0123456789abcdef";

+ static char sp[] = " ";

+ char *s0, *s1; /* s0: start of text not yet copied; s1: scan position */

+ int n0; /* format characters left to scan */

+ NODE *sfmt, *arg;

+ register NODE *carg; /* next unconsumed argument */

+ long fw, prec, lj, alt, big; /* width, precision, '-', '#', 'l' */

+ long *cur; /* where digits and '*' are stored: &fw, then &prec after '.' */

+ long val;

+#ifdef sun386 /* Can't cast unsigned (int/long) from ptr->value */

+ long tmp_uval; /* on 386i 4.0.1 C compiler -- it just hangs */

+#endif

+ unsigned long uval;

+ int sgn;

+ int base;

+ char cpbuf[30]; /* if we have numbers bigger than 30 */

+ char *cend = &cpbuf[30];/* chars, we lose, but seems unlikely */

+ char *cp;

+ char *fill;

+ double tmpval;

+ char *pr_str;

+ int ucasehex = 0;

+ char signchar = 0;

+ int len;

+ emalloc(obuf, char *, 120, "do_sprintf");

+ osiz = 120;

+ ofre = osiz - 1;

+ olen = 0;

+ sfmt = tree_eval(tree->lnode);

+ sfmt = force_string(sfmt);

+ carg = tree->rnode;

+ for (s0 = s1 = sfmt->stptr, n0 = sfmt->stlen; n0-- > 0;) {

+ if (*s1 != '%') {

+ s1++;

+ continue;

+ }

+ bchunk(s0, s1 - s0); /* copy the literal text that precedes this '%' */

+ s0 = s1;

+ cur = &fw;

+ fw = 0;

+ prec = 0;

+ lj = alt = big = 0;

+ fill = sp;

+ cp = cend;

+ s1++;

+retry: /* dispatch on the next character of the same conversion spec */

+ --n0;

+ switch (*s1++) {

+ case '%':

+ bchunk("%", 1);

+ s0 = s1;

+ break;

+ case '0':

+ if (fill != sp || lj)

+ goto lose;

+ if (cur == &fw)

+ fill = "0"; /* FALL through */

+ case '1':

+ case '2':

+ case '3':

+ case '4':

+ case '5':

+ case '6':

+ case '7':

+ case '8':

+ case '9':

+ if (cur == 0)

+ goto lose;

+ *cur = s1[-1] - '0';

+ while (n0 > 0 && *s1 >= '0' && *s1 <= '9') {

+ --n0;

+ *cur = *cur * 10 + *s1++ - '0';

+ }

+ goto retry;

+ case '*': /* width or precision taken from the next argument */

+ if (cur == 0)

+ goto lose;

+ parse_next_arg();

+ *cur = force_number(arg);

+ free_temp(arg);

+ goto retry;

+ case ' ': /* print ' ' or '-' */

+ case '+': /* print '+' or '-' */

+ signchar = *(s1-1);

+ goto retry;

+ case '-':

+ if (lj || fill != sp)

+ goto lose;

+ lj++;

+ goto retry;

+ case '.':

+ if (cur != &fw)

+ goto lose;

+ cur = &prec;

+ goto retry;

+ case '#':

+ if (alt)

+ goto lose;

+ alt++;

+ goto retry;

+ case 'l':

+ if (big)

+ goto lose;

+ big++;

+ goto retry;

+ case 'c':

+ parse_next_arg();

+ if (arg->flags & NUMBER) { /* numeric arg: emit it as a single character */

+#ifdef sun386

+ tmp_uval = arg->numbr;

+ uval= (unsigned long) tmp_uval;

+#else

+ uval = (unsigned long) arg->numbr;

+#endif

+ cpbuf[0] = uval;

+ prec = 1;

+ pr_str = cpbuf;

+ goto dopr_string;

+ }

+ if (! prec)

+ prec = 1;

+ else if (prec > arg->stlen)

+ prec = arg->stlen;

+ pr_str = arg->stptr;

+ goto dopr_string;

+ case 's':

+ parse_next_arg();

+ arg = force_string(arg);

+ if (!prec || prec > arg->stlen)

+ prec = arg->stlen;

+ pr_str = arg->stptr;

+ dopr_string: /* emit prec bytes of pr_str, space-padded to width fw */

+ if (fw > prec && !lj) {

+ while (fw > prec) {

+ bchunk(sp, 1);

+ fw--;

+ }

+ bchunk(pr_str, (int) prec);

+ if (fw > prec) {

+ while (fw > prec) {

+ bchunk(sp, 1);

+ fw--;

+ }

+ s0 = s1;

+ free_temp(arg);

+ break;

+ case 'd':

+ case 'i':

+ parse_next_arg();

+ val = (long) force_number(arg);

+ free_temp(arg);

+ if (val < 0) {

+ sgn = 1;

+ val = -val;

+ } else

+ sgn = 0;

+ do { /* digits are generated backwards from the end of cpbuf */

+ *--cp = '0' + val % 10;

+ val /= 10;

+ } while (val);

+ if (sgn)

+ *--cp = '-';

+ else if (signchar)

+ *--cp = signchar;

+ if (prec > fw)

+ fw = prec;

+ prec = cend - cp;

+ if (fw > prec && !lj) {

+ if (fill != sp && (*cp == '-' || signchar)) {

+ bchunk(cp, 1);

+ cp++;

+ prec--;

+ fw--;

+ }

+ while (fw > prec) {

+ bchunk(fill, 1);

+ fw--;

+ }

+ bchunk(cp, (int) prec);

+ if (fw > prec) {

+ while (fw > prec) {

+ bchunk(fill, 1);

+ fw--;

+ }

+ s0 = s1;

+ break;

+ case 'u':

+ base = 10;

+ goto pr_unsigned;

+ case 'o':

+ base = 8;

+ goto pr_unsigned;

+ case 'X':

+ ucasehex = 1;

+ case 'x':

+ base = 16;

+ goto pr_unsigned;

+ pr_unsigned:

+ parse_next_arg();

+ uval = (unsigned long) force_number(arg);

+ free_temp(arg);

+ do {

+ *--cp = chbuf[uval % base];

+ if (ucasehex && isalpha(*cp))

+ *cp = toupper(*cp);

+ uval /= base;

+ } while (uval);

+ if (alt && (base == 8 || base == 16)) { /* '#': prefix 0, 0x or 0X */

+ if (base == 16) {

+ if (ucasehex)

+ *--cp = 'X';

+ else

+ *--cp = 'x';

+ }

+ *--cp = '0';

+ }

+ prec = cend - cp;

+ if (fw > prec && !lj) {

+ while (fw > prec) {

+ bchunk(fill, 1);

+ fw--;

+ }

+ bchunk(cp, (int) prec);

+ if (fw > prec) {

+ while (fw > prec) {

+ bchunk(fill, 1);

+ fw--;

+ }

+ s0 = s1;

+ break;

+ case 'g':

+ parse_next_arg();

+ tmpval = force_number(arg);

+ free_temp(arg);

+ chksize(fw + prec + 9); /* 9==slop */

+ cp = cpbuf; /* cpbuf is reused to build a format for the C library */

+ *cp++ = '%';

+ if (lj)

+ *cp++ = '-';

+ if (fill != sp)

+ *cp++ = '0';

+#ifndef GFMT_WORKAROUND

+ if (cur != &fw) {

+ (void) strcpy(cp, "*.*g");

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, (int) prec, (double) tmpval);

+ } else {

+ (void) strcpy(cp, "*g");

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, (double) tmpval);

+ }

+#else /* GFMT_WORKAROUND */

+ {

+ char *gptr, gbuf[120];

+#define DEFAULT_G_PRECISION 6

+ if (fw + prec + 9 > sizeof gbuf) { /* 9==slop */

+ emalloc(gptr, char *, fw+prec+9, "do_sprintf(gfmt)");

+ } else

+ gptr = gbuf;

+ (void) gfmt((double) tmpval, cur != &fw ?

+ (int) prec : DEFAULT_G_PRECISION, gptr);

+ *cp++ = '*', *cp++ = 's', *cp = '\0';

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, gptr);

+ if (fill != sp && *gptr == ' ') {

+ char *p = gptr;

+ do { *p++ = '0'; } while (*p == ' ');

+ }

+ if (gptr != gbuf) free(gptr);

+ }

+#endif /* GFMT_WORKAROUND */

+ len = strlen(obuf + olen);

+ ofre -= len;

+ olen += len;

+ s0 = s1;

+ break;

+ case 'f':

+ parse_next_arg();

+ tmpval = force_number(arg);

+ free_temp(arg);

+ chksize(fw + prec + 9); /* 9==slop */

+ cp = cpbuf;

+ *cp++ = '%';

+ if (lj)

+ *cp++ = '-';

+ if (fill != sp)

+ *cp++ = '0';

+ if (cur != &fw) {

+ (void) strcpy(cp, "*.*f");

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, (int) prec, (double) tmpval);

+ } else {

+ (void) strcpy(cp, "*f");

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, (double) tmpval);

+ }

+ len = strlen(obuf + olen);

+ ofre -= len;

+ olen += len;

+ s0 = s1;

+ break;

+ case 'e':

+ parse_next_arg();

+ tmpval = force_number(arg);

+ free_temp(arg);

+ chksize(fw + prec + 9); /* 9==slop */

+ cp = cpbuf;

+ *cp++ = '%';

+ if (lj)

+ *cp++ = '-';

+ if (fill != sp)

+ *cp++ = '0';

+ if (cur != &fw) {

+ (void) strcpy(cp, "*.*e");

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, (int) prec, (double) tmpval);

+ } else {

+ (void) strcpy(cp, "*e");

+ (void) sprintf(obuf + olen, cpbuf, (int) fw, (double) tmpval);

+ }

+ len = strlen(obuf + olen);

+ ofre -= len;

+ olen += len;

+ s0 = s1;

+ break;

+ default:

+ lose: /* bad spec: s0 still points at its '%', so it is copied verbatim later */

+ break;

+ }

+ if (toofew)

+ fatal("%s\n\t%s\n\t%*s%s",

+ "not enough arguments to satisfy format string",

+ sfmt->stptr, s1 - sfmt->stptr - 2, "",

+ "^ ran out for this one"

+ );

+ }

+ if (do_lint && carg != NULL)

+ warning("too many arguments supplied for format string");

+ bchunk(s0, s1 - s0); /* trailing literal text */

+ free_temp(sfmt);

+ r = make_str_node(obuf, olen, ALREADY_MALLOCED);

+ r->flags |= TEMP;

+ return r;

+/*
+ * do_printf: the printf statement.  tree->lnode is formatted through
+ * do_sprintf() and written to stdout, or to the redirection described
+ * by tree->rnode.  Returns silently when the redirection yields no
+ * stream.
+ */
+void
+do_printf(tree)
+register NODE *tree;
+{
+	struct redirect *rp = NULL;
+	register FILE *fp;
+	NODE *text;
+
+	if (tree->rnode == NULL)
+		fp = stdout;
+	else {
+		int errflg;	/* not used, sigh */
+
+		rp = redirect(tree->rnode, &errflg);
+		if (rp == NULL)
+			return;
+		fp = rp->fp;
+		if (fp == NULL)
+			return;
+	}
+
+	text = do_sprintf(tree->lnode);
+	efwrite(text->stptr, sizeof(char), text->stlen, fp, "printf", rp , 1);
+	free_temp(text);
+}

+/*
+ * do_sqrt: builtin sqrt(x).  A negative argument draws a warning; the
+ * library result is returned regardless.
+ */
+NODE *
+do_sqrt(tree)
+NODE *tree;
+{
+	NODE *argnode;
+	double x;
+	extern double sqrt P((double));
+
+	argnode = tree_eval(tree->lnode);
+	x = (double) force_number(argnode);
+	free_temp(argnode);
+
+	if (x < 0.0)
+		warning("sqrt called with negative argument %g", x);
+	return tmp_number((AWKNUM) sqrt(x));
+}

+/*
+ * do_substr: builtin substr(s, start [, length]).  start is 1-based and
+ * values below 1 are treated as 1.  Without a third argument the
+ * substring runs to the end of s.  Returns Nnull_string when start lies
+ * beyond the string or the requested length is not positive.
+ */
+NODE *
+do_substr(tree)
+NODE *tree;
+{
+	NODE *t1, *t2, *t3;
+	NODE *r;
+	register int indx;
+	size_t length;
+
+	t1 = tree_eval(tree->lnode);
+	t2 = tree_eval(tree->rnode->lnode);
+	if (tree->rnode->rnode == NULL) {	/* third arg. missing */
+		/*
+		 * stlen only describes the string form of the value, so
+		 * convert before reading it.
+		 */
+		t1 = force_string(t1);
+		length = t1->stlen;
+	} else {
+		double want;
+
+		t3 = tree_eval(tree->rnode->rnode->lnode);
+		want = (double) force_number(t3);
+		free_temp(t3);
+		/* a negative count must not be converted to size_t */
+		length = (want > 0.0) ? (size_t) want : 0;
+	}
+	indx = (int) force_number(t2) - 1;	/* to 0-based */
+	free_temp(t2);
+	t1 = force_string(t1);
+	if (indx < 0)
+		indx = 0;
+	if (indx >= t1->stlen || length == 0) {
+		free_temp(t1);
+		return Nnull_string;
+	}
+	/* clip to the end of the string, guarding the sum against overflow */
+	if (indx + length > t1->stlen || LONG_MAX - indx < length)
+		length = t1->stlen - indx;
+	r = tmp_string(t1->stptr + indx, length);
+	free_temp(t1);
+	return r;
+}

+/*
+ * do_strftime: builtin strftime(format [, timestamp]).  Formats the
+ * given time, or the current time when the second argument is missing,
+ * as local time.  The result is limited to the size of the local buffer.
+ */
+NODE *
+do_strftime(tree)
+NODE *tree;
+{
+	NODE *t1, *t2;
+	struct tm *tm;
+	time_t fclock;
+	char buf[100];
+	int ret;
+
+	t1 = force_string(tree_eval(tree->lnode));
+
+	if (tree->rnode == NULL)	/* second arg. missing, default */
+		(void) time(&fclock);
+	else {
+		t2 = tree_eval(tree->rnode->lnode);
+		fclock = (time_t) force_number(t2);
+		free_temp(t2);
+	}
+	tm = localtime(&fclock);
+
+	ret = strftime(buf, sizeof buf, t1->stptr, tm);
+	/* the format has been consumed; release it like every other builtin does */
+	free_temp(t1);
+
+	return tmp_string(buf, ret);
+}

+/* do_systime: builtin systime(); the current time() value as a number. */
+NODE *
+do_systime(tree)
+NODE *tree;
+{
+	time_t now;
+
+	(void) time(&now);
+	return tmp_number((AWKNUM) now);
+}

+/*
+ * do_system: builtin system(cmd).  Pending output is flushed first, then
+ * a non-empty command is run through system().  Returns bits 8-15 of the
+ * value system() gave back, or 0 if the command was empty.
+ */
+NODE *
+do_system(tree)
+NODE *tree;
+{
+	NODE *cmdnode;
+	char *cmd;
+	int status = 0;
+
+	(void) flush_io ();	/* so output is synchronous with gawk's */
+
+	cmdnode = tree_eval(tree->lnode);
+	cmd = force_string(cmdnode)->stptr;
+	if (cmd && *cmd)
+		status = (system(cmd) >> 8) & 0xff;
+	free_temp(cmdnode);
+	return tmp_number((AWKNUM) status);
+}

+/*
+ * do_print: the print statement.  Every expression on the list at
+ * tree->lnode is written to stdout, or to the redirection described by
+ * tree->rnode, with OFS between items and ORS at the end.  Numeric
+ * values are formatted with OFMT when it differs from CONVFMT.  Returns
+ * silently when the redirection yields no stream.
+ */
+void
+do_print(tree)
+register NODE *tree;
+{
+	register NODE *t1;
+	struct redirect *rp = NULL;
+	register FILE *fp;
+	register char *s;
+
+	if (tree->rnode) {
+		int errflg;	/* not used, sigh */
+
+		rp = redirect(tree->rnode, &errflg);
+		if (rp) {
+			fp = rp->fp;
+			if (!fp)
+				return;
+		} else
+			return;
+	} else
+		fp = stdout;
+	tree = tree->lnode;
+	while (tree) {
+		t1 = tree_eval(tree->lnode);
+		if (t1->flags & NUMBER) {
+			if (OFMTidx == CONVFMTidx)
+				(void) force_string(t1);
+			else {
+				char buf[100];
+
+				sprintf(buf, OFMT, t1->numbr);
+				/*
+				 * Release the evaluated value before t1 is
+				 * pointed at the formatted copy; otherwise a
+				 * temporary result is never freed.
+				 */
+				free_temp(t1);
+				t1 = tmp_string(buf, strlen(buf));
+			}
+		}
+		efwrite(t1->stptr, sizeof(char), t1->stlen, fp, "print", rp, 0);
+		free_temp(t1);
+		tree = tree->rnode;
+		if (tree) {
+			/* more items follow: emit the output field separator */
+			s = OFS;
+			if (OFSlen)
+				efwrite(s, sizeof(char), OFSlen, fp, "print", rp, 0);
+		}
+	}
+	s = ORS;
+	if (ORSlen)
+		efwrite(s, sizeof(char), ORSlen, fp, "print", rp, 1);
+}

+/*
+ * do_tolower: builtin tolower(s).  Returns a temporary copy of s with
+ * every upper-case character replaced by its lower-case form.
+ */
+NODE *
+do_tolower(tree)
+NODE *tree;
+{
+	NODE *t1, *t2;
+	register char *cp, *cp2;
+
+	t1 = tree_eval(tree->lnode);
+	t1 = force_string(t1);
+	t2 = tmp_string(t1->stptr, t1->stlen);
+	/*
+	 * The <ctype.h> routines need an argument representable as unsigned
+	 * char; a plain char may be negative for bytes with the high bit set.
+	 */
+	for (cp = t2->stptr, cp2 = t2->stptr + t2->stlen; cp < cp2; cp++)
+		if (isupper((unsigned char) *cp))
+			*cp = tolower((unsigned char) *cp);
+	free_temp(t1);
+	return t2;
+}

+/*
+ * do_toupper: builtin toupper(s).  Returns a temporary copy of s with
+ * every lower-case character replaced by its upper-case form.
+ */
+NODE *
+do_toupper(tree)
+NODE *tree;
+{
+	NODE *t1, *t2;
+	register char *cp, *cp2;
+
+	t1 = tree_eval(tree->lnode);
+	t1 = force_string(t1);
+	t2 = tmp_string(t1->stptr, t1->stlen);
+	/*
+	 * The <ctype.h> routines need an argument representable as unsigned
+	 * char; a plain char may be negative for bytes with the high bit set.
+	 */
+	for (cp = t2->stptr, cp2 = t2->stptr + t2->stlen; cp < cp2; cp++)
+		if (islower((unsigned char) *cp))
+			*cp = toupper((unsigned char) *cp);
+	free_temp(t1);
+	return t2;
+}

+/* do_atan2: builtin atan2(y, x), passed straight to the C library. */
+NODE *
+do_atan2(tree)
+NODE *tree;
+{
+	NODE *ynode, *xnode;
+	extern double atan2 P((double, double));
+	double y, x;
+
+	ynode = tree_eval(tree->lnode);
+	xnode = tree_eval(tree->rnode->lnode);
+	y = force_number(ynode);
+	x = force_number(xnode);
+	free_temp(ynode);
+	free_temp(xnode);
+	return tmp_number((AWKNUM) atan2(y, x));
+}

+/* do_sin: builtin sin(x). */
+NODE *
+do_sin(tree)
+NODE *tree;
+{
+	NODE *argnode;
+	extern double sin P((double));
+	double result;
+
+	argnode = tree_eval(tree->lnode);
+	result = sin((double)force_number(argnode));
+	free_temp(argnode);
+	return tmp_number((AWKNUM) result);
+}

+/* do_cos: builtin cos(x). */
+NODE *
+do_cos(tree)
+NODE *tree;
+{
+	NODE *argnode;
+	extern double cos P((double));
+	double result;
+
+	argnode = tree_eval(tree->lnode);
+	result = cos((double)force_number(argnode));
+	free_temp(argnode);
+	return tmp_number((AWKNUM) result);
+}

+static int firstrand = 1;

+static char state[256];

+/*
+ * do_rand: builtin rand().  Returns a number r with 0 <= r < 1.  The
+ * generator is initialized with seed 1 on first use if srand() has not
+ * been called.
+ */
+/* ARGSUSED */
+NODE *
+do_rand(tree)
+NODE *tree;
+{
+	if (firstrand) {
+		(void) initstate((unsigned) 1, state, sizeof state);
+		srandom(1);
+		firstrand = 0;
+	}
+	/*
+	 * random() yields values in 0 .. 2^31-1 whatever the width of long,
+	 * so scale by 2^31.  Dividing by LONG_MAX gives results near zero
+	 * when long is 64 bits, and can give exactly 1.0 when it is 32.
+	 */
+	return tmp_number((AWKNUM) random() / 2147483648.0);
+}

+/*
+ * do_srand: builtin srand([seed]).  Reseeds the generator with the given
+ * value, or with the current time when tree is null, and returns the
+ * seed that was in effect before the call.
+ */
+NODE *
+do_srand(tree)
+NODE *tree;
+{
+	NODE *seednode;
+	static long save_seed = 0;
+	long previous = save_seed;	/* SVR4 awk srand returns previous seed */
+
+	if (firstrand)
+		(void) initstate((unsigned) 1, state, sizeof state);
+	else
+		(void) setstate(state);
+
+	if (tree) {
+		seednode = tree_eval(tree->lnode);
+		save_seed = (long) force_number(seednode);
+		srandom((int) save_seed);
+		free_temp(seednode);
+	} else {
+		save_seed = (long) time((time_t *) 0);
+		srandom((int) save_seed);
+	}
+	firstrand = 0;
+	return tmp_number((AWKNUM) previous);
+}

+/*
+ * do_match: builtin match(s, re).  Stores the 1-based start of the match
+ * in RSTART and its length in RLENGTH (0 and -1 when nothing matches)
+ * and returns the RSTART value.
+ */
+NODE *
+do_match(tree)
+NODE *tree;
+{
+	NODE *subject;
+	Regexp *rp;
+	int where;
+	int rstart = 0;			/* values for a failed match */
+	AWKNUM rlength = -1.0;
+
+	subject = force_string(tree_eval(tree->lnode));
+	rp = re_update(tree->rnode->lnode);
+
+	where = research(rp, subject->stptr, 0, subject->stlen, 1);
+	if (where >= 0) {	/* match succeeded */
+		rstart = where + 1;	/* 1-based indexing */
+		rlength = REEND(rp, subject->stptr) - RESTART(rp, subject->stptr);
+	}
+	free_temp(subject);
+
+	unref(RSTART_node->var_value);
+	RSTART_node->var_value = make_number((AWKNUM) rstart);
+	unref(RLENGTH_node->var_value);
+	RLENGTH_node->var_value = make_number(rlength);
+	return tmp_number((AWKNUM) rstart);
+}

+static NODE *

+sub_common(tree, global)

+NODE *tree; /* argument list: regex, replacement, target (in that order) */

+int global; /* nonzero: replace every match (do_gsub); zero: first match only (do_sub) */

+ register char *scan;

+ register char *bp, *cp;

+ char *buf; /* result string under construction */

+ int buflen;

+ register char *matchend;

+ register int len;

+ char *matchstart;

+ char *text; /* start of the not yet copied part of the target */

+ int textlen;

+ char *repl;

+ char *replend;

+ int repllen; /* replacement length, not counting '&' and escaping backslashes */

+ int sofar;

+ int ampersands; /* number of unescaped '&' in the replacement */

+ int matches = 0;

+ Regexp *rp;

+ NODE *s; /* subst. pattern */

+ NODE *t; /* string to make sub. in; $0 if none given */

+ NODE *tmp;

+ NODE **lhs = &tree; /* value not used -- just different from NULL */

+ int priv = 0;

+ Func_ptr after_assign = NULL;

+ tmp = tree->lnode;

+ rp = re_update(tmp);

+ tree = tree->rnode;

+ s = tree->lnode;

+ tree = tree->rnode;

+ tmp = tree->lnode;

+ t = force_string(tree_eval(tmp));

+ /* do the search early to avoid work on non-match */

+ if (research(rp, t->stptr, 0, t->stlen, 1) == -1 ||

+ (RESTART(rp, t->stptr) > t->stlen) && (matches = 1)) {

+ free_temp(t);

+ return tmp_number((AWKNUM) matches);

+ }

+ if (tmp->type == Node_val) /* constant target: nothing to assign back to */

+ lhs = NULL;

+ else

+ lhs = get_lhs(tmp, &after_assign);

+ t->flags |= STRING;

+ /*

+ * create a private copy of the string

+ */

+ if (t->stref > 1 || (t->flags & PERM)) {

+ unsigned int saveflags;

+ saveflags = t->flags;

+ t->flags &= ~MALLOC;

+ tmp = dupnode(t);

+ t->flags = saveflags;

+ t = tmp;

+ priv = 1;

+ }

+ text = t->stptr;

+ textlen = t->stlen;

+ buflen = textlen + 2;

+ s = force_string(tree_eval(s));

+ repl = s->stptr;

+ replend = repl + s->stlen;

+ repllen = replend - repl;

+ emalloc(buf, char *, buflen, "do_sub");

+ ampersands = 0;

+ for (scan = repl; scan < replend; scan++) { /* pre-scan the replacement once */

+ if (*scan == '&') {

+ repllen--;

+ ampersands++;

+ } else if (*scan == '\\' && (*(scan+1) == '&' || *(scan+1) == '\\')) {

+ repllen--;

+ scan++;

+ }

+ bp = buf;

+ for (;;) { /* one pass per match */

+ matches++;

+ matchstart = t->stptr + RESTART(rp, t->stptr);

+ matchend = t->stptr + REEND(rp, t->stptr);

+ /*

+ * create the result, copying in parts of the original

+ * string

+ */

+ len = matchstart - text + repllen

+ + ampersands * (matchend - matchstart);

+ sofar = bp - buf;

+ while (buflen - sofar - len - 1 < 0) {

+ buflen *= 2;

+ erealloc(buf, char *, buflen, "do_sub");

+ bp = buf + sofar; /* buf may have moved */

+ }

+ for (scan = text; scan < matchstart; scan++)

+ *bp++ = *scan;

+ for (scan = repl; scan < replend; scan++)

+ if (*scan == '&') /* '&' stands for the matched text */

+ for (cp = matchstart; cp < matchend; cp++)

+ *bp++ = *cp;

+ else if (*scan == '\\' && (*(scan+1) == '&' || *(scan+1) == '\\')) {

+ scan++;

+ *bp++ = *scan;

+ } else

+ *bp++ = *scan;

+ if (global && matchstart == matchend && matchend < text + textlen) { /* empty match: copy one char and step past it */

+ *bp++ = *matchend;

+ matchend++;

+ }

+ textlen = text + textlen - matchend;

+ text = matchend;

+ if (!global || textlen <= 0 ||

+ research(rp, t->stptr, text-t->stptr, textlen, 1) == -1)

+ break;

+ }

+ sofar = bp - buf;

+ if (buflen - sofar - textlen - 1) { /* any size mismatch resizes to the final length */

+ buflen = sofar + textlen + 2;

+ erealloc(buf, char *, buflen, "do_sub");

+ bp = buf + sofar;

+ }

+ for (scan = matchend; scan < text + textlen; scan++) /* copy the unmatched tail */

+ *bp++ = *scan;

+ textlen = bp - buf;

+ free(t->stptr);

+ t->stptr = buf;

+ t->stlen = textlen;

+ free_temp(s);

+ if (matches > 0 && lhs) {

+ if (priv) {

+ unref(*lhs);

+ *lhs = t;

+ }

+ if (after_assign)

+ (*after_assign)();

+ t->flags &= ~(NUM|NUMBER);

+ }

+ return tmp_number((AWKNUM) matches);

+/* do_gsub: builtin gsub(); sub_common() with the global flag set. */
+NODE *
+do_gsub(tree)
+NODE *tree;
+{
+	int global = 1;
+
+	return sub_common(tree, global);
+}

+/* do_sub: builtin sub(); sub_common() with the global flag clear. */
+NODE *
+do_sub(tree)
+NODE *tree;
+{
+	int global = 0;
+
+	return sub_common(tree, global);
+}

+#ifdef GFMT_WORKAROUND

+ /*

+ * printf's %g format [can't rely on gcvt()]

+ * caveat: don't use as argument to *printf()!

+ *

+ * Formats g into buf with at most prec significant digits (a prec below

+ * 1 is raised to 1) and returns buf.  The value is first printed in 'e'

+ * format; if the exponent is at least -4 and below prec it is reprinted

+ * in 'f' format.  Trailing zeroes of the fraction are then removed.

+ */

+char *

+gfmt(g, prec, buf)

+double g; /* value to format */

+int prec; /* indicates desired significant digits, not decimal places */

+char *buf; /* return buffer; assumed big enough to hold result */

+ if (g == 0.0) {

+ (void) strcpy(buf, "0"); /* easy special case */

+ } else {

+ register char *d, *e, *p; /* d: decimal point; e: exponent or end of string; p: scan */

+ /* start with 'e' format (it'll provide nice exponent) */

+ if (prec < 1) prec = 1; /* at least 1 significant digit */

+ (void) sprintf(buf, "%.*e", prec - 1, g);

+ if ((e = strchr(buf, 'e')) != 0) { /* find exponent */

+ int exp = atoi(e+1); /* fetch exponent */

+ if (exp >= -4 && exp < prec) { /* per K&R2, B1.2 */

+ /* switch to 'f' format and re-do */

+ prec -= (exp + 1); /* decimal precision */

+ (void) sprintf(buf, "%.*f", prec, g);

+ e = buf + strlen(buf); /* no exponent now: e marks the terminating NUL */

+ }

+ if ((d = strchr(buf, '.')) != 0) {

+ /* remove trailing zeroes and decimal point */

+ for (p = e; p > d && *--p == '0'; ) continue;

+ if (*p == '.') --p;

+ if (++p < e) /* copy exponent and NUL */

+ while ((*p++ = *e++) != '\0') continue;

+ }

+ return buf;

+#endif /* GFMT_WORKAROUND */

diff --git a/gnu/usr.bin/awk/config.h b/gnu/usr.bin/awk/config.h
new file mode 100644
index 000000000000..8c20953ed531
--- /dev/null
+++ b/gnu/usr.bin/awk/config.h

@@ -0,0 +1,272 @@

+/*

+ * config.h -- configuration definitions for gawk.

+ *

+ * For generic 4.4 alpha

+ */

+/*

+ *

+ * This file is part of GAWK, the GNU implementation of the

+ * AWK Programming Language.

+ *

+ * GAWK is free software; you can redistribute it and/or modify

+ * it under the terms of the GNU General Public License as published by

+ * the Free Software Foundation; either version 2, or (at your option)

+ * any later version.

+ *

+ * GAWK is distributed in the hope that it will be useful,

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

+ * GNU General Public License for more details.

+ *

+ * You should have received a copy of the GNU General Public License

+ * along with GAWK; see the file COPYING. If not, write to

+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

+ */

+/*

+ * This file isolates configuration dependencies for gnu awk.

+ * You should know something about your system, perhaps by having

+ * a manual handy, when you edit this file. You should copy config.h-dist

+ * to config.h, and edit config.h. Do not modify config.h-dist, so that

+ * it will be easy to apply any patches that may be distributed.

+ *

+ * The general idea is that systems conforming to the various standards

+ * should need to do the least amount of changing. Defining the various

+ * items in this file usually means that your system is missing that

+ * particular feature.

+ *

+ * The order of preference in standard conformance is ANSI C, POSIX,

+ * and the SVID.

+ *

+ * If you have no clue as to what's going on with your system, try

+ * compiling gawk without editing this file and see what shows up

+ * missing in the link stage. From there, you can probably figure out

+ * which defines to turn on.

+ */

+/**************************/

+/* Miscellaneous features */

+/**************************/

+/*

+ * BLKSIZE_MISSING

+ *

+ * Check your /usr/include/sys/stat.h file. If the stat structure

+ * does not have a member named st_blksize, define this. (This will

+ * most likely be the case on most System V systems prior to V.4.)

+ */

+/* #define BLKSIZE_MISSING 1 */

+/*

+ * SIGTYPE

+ *

+ * The return type of the routines passed to the signal function.

+ * Modern systems use `void', older systems use `int'.

+ * If left undefined, it will default to void.

+ */

+/* #define SIGTYPE int */

+/*

+ * SIZE_T_MISSING

+ *

+ * If your system has no typedef for size_t, define this to get a default

+ */

+/* #define SIZE_T_MISSING 1 */

+/*

+ * CHAR_UNSIGNED

+ *

+ * If your machine uses unsigned characters (IBM RT and RS/6000 and others)

+ * then define this for use in regex.c

+ */

+/* #define CHAR_UNSIGNED 1 */

+/*

+ * HAVE_UNDERSCORE_SETJMP

+ *

+ * Check in your /usr/include/setjmp.h file. If there are routines

+ * there named _setjmp and _longjmp, then you should define this.

+ * Typically only systems derived from Berkeley Unix have this.

+ */

+#define HAVE_UNDERSCORE_SETJMP 1

+/***********************************************/

+/* Missing library subroutines or system calls */

+/***********************************************/

+/*

+ * MEMCMP_MISSING

+ * MEMCPY_MISSING

+ * MEMSET_MISSING

+ *

+ * These three routines are for manipulating blocks of memory. Most

+ * likely they will either all three be present or all three be missing,

+ * so they're grouped together.

+ */

+/* #define MEMCMP_MISSING 1 */

+/* #define MEMCPY_MISSING 1 */

+/* #define MEMSET_MISSING 1 */

+/*

+ * RANDOM_MISSING

+ *

+ * Your system does not have the random(3) suite of random number

+ * generating routines. These are different than the old rand(3)

+ * routines!

+ */

+/* #define RANDOM_MISSING 1 */

+/*

+ * STRCASE_MISSING

+ *

+ * Your system does not have the strcasecmp() and strncasecmp()

+ * routines that originated in Berkeley Unix.

+ */

+/* #define STRCASE_MISSING 1 */

+/*

+ * STRCHR_MISSING

+ *

+ * Your system does not have the strchr() and strrchr() functions.

+ */

+/* #define STRCHR_MISSING 1 */

+/*

+ * STRERROR_MISSING

+ *

+ * Your system lacks the ANSI C strerror() routine for returning the

+ * strings associated with errno values.

+ */

+/* #define STRERROR_MISSING 1 */

+/*

+ * STRTOD_MISSING

+ *

+ * Your system does not have the strtod() routine for converting

+ * strings to double precision floating point values.

+ */

+/* #define STRTOD_MISSING 1 */

+/*

+ * STRFTIME_MISSING

+ *

+ * Your system lacks the ANSI C strftime() routine for formatting

+ * broken down time values.

+ */

+/* #define STRFTIME_MISSING 1 */

+/*

+ * TZSET_MISSING

+ *

+ * If you have a 4.2 BSD vintage system, then the strftime() routine

+ * supplied in the missing directory won't be enough, because it relies on the

+ * tzset() routine from System V / Posix. Fortunately, there is an

+ * emulation for tzset() too that should do the trick. If you don't

+ * have tzset(), define this.

+ */

+/* #define TZSET_MISSING 1 */

+/*

+ * TZNAME_MISSING

+ *

+ * Some systems do not support the external variables tzname and daylight.

+ * If this is the case *and* strftime() is missing, define this.

+ */

+/* #define TZNAME_MISSING 1 */

+/*

+ * STDC_HEADERS

+ *

+ * If your system does have ANSI compliant header files that

+ * provide prototypes for library routines, then define this.

+ */

+#define STDC_HEADERS 1

+/*

+ * NO_TOKEN_PASTING

+ *

+ * If your compiler defines __STDC__ but does not support token

+ * pasting (tok##tok), then define this.

+ */

+/* #define NO_TOKEN_PASTING 1 */

+/*****************************************************************/

+/* Stuff related to the Standard I/O Library. */

+/*****************************************************************/

+/* Much of this is (still, unfortunately) black magic in nature. */

+/* You may have to use some or all of these together to get gawk */

+/* to work correctly. */

+/*****************************************************************/

+/*

+ * NON_STD_SPRINTF

+ *

+ * Look in your /usr/include/stdio.h file. If the return type of the

+ * sprintf() function is NOT `int', define this.

+ */

+/* #define NON_STD_SPRINTF 1 */

+/*

+ * VPRINTF_MISSING

+ *

+ * Define this if your system lacks vprintf() and the other routines

+ * that go with it. This will trigger an attempt to use _doprnt().

+ * If you don't have that, this attempt will fail and you are on your own.

+ */

+/* #define VPRINTF_MISSING 1 */

+/*

+ * Casts from size_t to int and back. These will become unnecessary

+ * at some point in the future, but for now are required where the

+ * two types are a different representation.

+ */

+/* #define SZTC */

+/* #define INTC */

+/*

+ * SYSTEM_MISSING

+ *

+ * Define this if your library does not provide a system function

+ * or you are not entirely happy with it and would rather use

+ * a provided replacement (atari only).

+ */

+/* #define SYSTEM_MISSING 1 */

+/*

+ * FMOD_MISSING

+ *

+ * Define this if your system lacks the fmod() function and modf() will

+ * be used instead.

+ */

+/* #define FMOD_MISSING 1 */

+/*******************************/

+/* Gawk configuration options. */

+/*******************************/

+/*

+ * DEFPATH

+ *

+ * The default search path for the -f option of gawk. It is used

+ * if the AWKPATH environment variable is undefined. The default

+ * definition is provided here. Most likely you should not change

+ * this.

+ */

+/* #define DEFPATH ".:/usr/lib/awk:/usr/local/lib/awk" */

+/* #define ENVSEP ':' */

+/*

+ * alloca already has a prototype defined - don't redefine it

+ */

+#define ALLOCA_PROTO 1

+/*

+ * srandom already has a prototype defined - don't redefine it

+ */

+#define SRANDOM_PROTO 1

+/* anything that follows is for system-specific short-term kludges */

diff --git a/gnu/usr.bin/awk/dfa.c b/gnu/usr.bin/awk/dfa.c
new file mode 100644
index 000000000000..5293c755871d
--- /dev/null
+++ b/gnu/usr.bin/awk/dfa.c

@@ -0,0 +1,2291 @@

+/* dfa.c - deterministic extended regexp routines for GNU

+ Written June, 1988 by Mike Haertel

+ Modified July, 1988 by Arthur David Olson

+ to assist BMG speedups

+ NO WARRANTY

+ BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY

+NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT

+WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,

+RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"

+WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,

+BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

+FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY

+AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE

+DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR

+CORRECTION.

+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.

+STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY

+WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE

+LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR

+OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE

+USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR

+DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR

+A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS

+PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH

+DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.

+ GENERAL PUBLIC LICENSE TO COPY

+ 1. You may copy and distribute verbatim copies of this source file

+as you receive it, in any medium, provided that you conspicuously and

+appropriately publish on each copy a valid copyright notice "Copyright

+ (C) 1988 Free Software Foundation, Inc."; and include following the

+copyright notice a verbatim copy of the above disclaimer of warranty

+and of this License. You may charge a distribution fee for the

+physical act of transferring a copy.

+ 2. You may modify your copy or copies of this source file or

+any portion of it, and copy and distribute such modifications under

+the terms of Paragraph 1 above, provided that you also do the following:

+ a) cause the modified files to carry prominent notices stating

+ that you changed the files and the date of any change; and

+ b) cause the whole of any work that you distribute or publish,

+ that in whole or in part contains or is a derivative of this

+ program or any part thereof, to be licensed at no charge to all

+ third parties on terms identical to those contained in this

+ License Agreement (except that you may choose to grant more extensive

+ warranty protection to some or all third parties, at your option).

+ c) You may charge a distribution fee for the physical act of

+ transferring a copy, and you may at your option offer warranty

+ protection in exchange for a fee.

+Mere aggregation of another unrelated program with this program (or its

+derivative) on a volume of a storage or distribution medium does not bring

+the other program under the scope of these terms.

+ 3. You may copy and distribute this program or any portion of it in

+compiled, executable or object code form under the terms of Paragraphs

+1 and 2 above provided that you do the following:

+ a) accompany it with the complete corresponding machine-readable

+ source code, which must be distributed under the terms of

+ Paragraphs 1 and 2 above; or,

+ b) accompany it with a written offer, valid for at least three

+ years, to give any third party free (except for a nominal

+ shipping charge) a complete machine-readable copy of the

+ corresponding source code, to be distributed under the terms of

+ Paragraphs 1 and 2 above; or,

+ c) accompany it with the information you received as to where the

+ corresponding source code may be obtained. (This alternative is

+ allowed only for noncommercial distribution and only if you

+ received the program in object code or executable form alone.)

+For an executable file, complete source code means all the source code for

+all modules it contains; but, as a special exception, it need not include

+source code for modules which are standard libraries that accompany the

+operating system on which the executable file runs.

+ 4. You may not copy, sublicense, distribute or transfer this program

+except as expressly provided under this License Agreement. Any attempt

+otherwise to copy, sublicense, distribute or transfer this program is void and

+your rights to use the program under this License agreement shall be

+automatically terminated. However, parties who have received computer

+software programs from you with this License Agreement will not have

+their licenses terminated so long as such parties remain in full compliance.

+ 5. If you wish to incorporate parts of this program into other free

+programs whose distribution conditions are different, write to the Free

+Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet

+worked out a simple rule that can be stated here, but we will often permit

+this. We will be guided by the two goals of preserving the free status of

+all derivatives our free software and of promoting the sharing and reuse of

+software.

+In other words, you are welcome to use, share and improve this program.

+You are forbidden to forbid anyone else to use, share and improve

+what you give them. Help stamp out software-hoarding! */

+#include "awk.h"

+#include <assert.h>

+#ifdef setbit /* surprise - setbit and clrbit are macros on NeXT */

+#undef setbit

+#endif

+#ifdef clrbit

+#undef clrbit

+#endif

+#ifdef __STDC__

+typedef void *ptr_t;

+#else

+typedef char *ptr_t;

+#endif

+typedef struct {

+ char ** in;

+ char * left;

+ char * right;

+ char * is;

+} must;

+static ptr_t xcalloc P((int n, size_t s));

+static ptr_t xmalloc P((size_t n));

+static ptr_t xrealloc P((ptr_t p, size_t n));

+static int tstbit P((int b, _charset c));

+static void setbit P((int b, _charset c));

+static void clrbit P((int b, _charset c));

+static void copyset P((const _charset src, _charset dst));

+static void zeroset P((_charset s));

+static void notset P((_charset s));

+static int equal P((const _charset s1, const _charset s2));

+static int charset_index P((const _charset s));

+static _token lex P((void));

+static void addtok P((_token t));

+static void atom P((void));

+static void closure P((void));

+static void branch P((void));

+static void regexp P((void));

+static void copy P((const _position_set *src, _position_set *dst));

+static void insert P((_position p, _position_set *s));

+static void merge P((_position_set *s1, _position_set *s2, _position_set *m));

+static void delete P((_position p, _position_set *s));

+static int state_index P((struct regexp *r, _position_set *s,

+ int newline, int letter));

+static void epsclosure P((_position_set *s, struct regexp *r));

+static void build_state P((int s, struct regexp *r));

+static void build_state_zero P((struct regexp *r));

+static char *icatalloc P((char *old, const char *new));

+static char *icpyalloc P((const char *string));

+static char *istrstr P((char *lookin, char *lookfor));

+static void ifree P((char *cp));

+static void freelist P((char **cpp));

+static char **enlist P((char **cpp, char *new, size_t len));

+static char **comsubs P((char *left, char *right));

+static char **addlists P((char **old, char **new));

+static char **inboth P((char **left, char **right));

+static void resetmust P((must *mp));

+static void regmust P((struct regexp *r));

+#undef P

+static ptr_t

+xcalloc(n, s)

+ int n;

+ size_t s;

+ ptr_t r = calloc(n, s);

+ if (NULL == r)

+ reg_error("Memory exhausted"); /* reg_error does not return */

+ return r;

+static ptr_t

+xmalloc(n)

+ size_t n;

+ ptr_t r = malloc(n);

+ assert(n != 0);

+ if (NULL == r)

+ reg_error("Memory exhausted");

+ return r;

+static ptr_t

+xrealloc(p, n)

+ ptr_t p;

+ size_t n;

+ ptr_t r = realloc(p, n);

+ assert(n != 0);

+ if (NULL == r)

+ reg_error("Memory exhausted");

+ return r;

+#define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t)))

+#undef MALLOC

+#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))

+#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))

+/* Reallocate an array of type t if nalloc is too small for index. */

+#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \

+ if ((index) >= (nalloc)) \

+ { \

+ while ((index) >= (nalloc)) \

+ (nalloc) *= 2; \

+ REALLOC(p, t, nalloc); \

+ }

+/* Stuff pertaining to charsets. */

+static int

+tstbit(b, c)

+ int b;

+ _charset c;

+ return c[b / INTBITS] & 1 << b % INTBITS;

+static void

+setbit(b, c)

+ int b;

+ _charset c;

+ c[b / INTBITS] |= 1 << b % INTBITS;

+static void

+clrbit(b, c)

+ int b;

+ _charset c;

+ c[b / INTBITS] &= ~(1 << b % INTBITS);

+static void

+copyset(src, dst)

+ const _charset src;

+ _charset dst;

+ int i;

+ for (i = 0; i < _CHARSET_INTS; ++i)

+ dst[i] = src[i];

+static void

+zeroset(s)

+ _charset s;

+ int i;

+ for (i = 0; i < _CHARSET_INTS; ++i)

+ s[i] = 0;

+static void

+notset(s)

+ _charset s;

+ int i;

+ for (i = 0; i < _CHARSET_INTS; ++i)

+ s[i] = ~s[i];

+static int

+equal(s1, s2)

+ const _charset s1;

+ const _charset s2;

+ int i;

+ for (i = 0; i < _CHARSET_INTS; ++i)

+ if (s1[i] != s2[i])

+ return 0;

+ return 1;

+/* A pointer to the current regexp is kept here during parsing. */

+static struct regexp *reg;

+/* Find the index of charset s in reg->charsets, or allocate a new charset.

+ A stored set that compares equal to s is reused and its index returned;

+ otherwise s is copied into the slot at index reg->cindex (growing the

+ array first if that slot does not exist yet) and that index is returned. */

+static int

+charset_index(s)

+ const _charset s;

+ int i;

+ for (i = 0; i < reg->cindex; ++i) /* linear search of the sets stored so far */

+ if (equal(s, reg->charsets[i]))

+ return i;

+ REALLOC_IF_NECESSARY(reg->charsets, _charset, reg->calloc, reg->cindex); /* no match: here i == reg->cindex */

+ ++reg->cindex;

+ copyset(s, reg->charsets[i]); /* i still names the slot just claimed */

+ return i;

+/* Syntax bits controlling the behavior of the lexical analyzer. */

+static syntax_bits, syntax_bits_set;

+/* Flag for case-folding letters into sets. */

+static case_fold;

+/* Entry point to set syntax options. */

+void

+regsyntax(bits, fold)

+ long bits;

+ int fold;

+ syntax_bits_set = 1;

+ syntax_bits = bits;

+ case_fold = fold;

+/* Lexical analyzer. */

+static const char *lexstart; /* Pointer to beginning of input string. */

+static const char *lexptr; /* Pointer to next input character. */

+static lexleft; /* Number of characters remaining. */

+static caret_allowed; /* True if backward context allows ^

+ (meaningful only if RE_CONTEXT_INDEP_OPS

+ is turned off). */

+static closure_allowed; /* True if backward context allows closures

+ (meaningful only if RE_CONTEXT_INDEP_OPS

+ is turned off). */

+/* Note that characters become unsigned here. */

+#define FETCH(c, eoferr) \

+ { \

+ if (! lexleft) \

+ if (eoferr != NULL) \

+ reg_error(eoferr); \

+ else \

+ return _END; \

+ (c) = (unsigned char) *lexptr++; \

+ --lexleft; \

+ }

+static _token

+lex()

+ _token c, c2;

+ int invert;

+ _charset cset;

+ FETCH(c, (char *) 0);

+ switch (c)

+ {

+ case '^':

+ if (! (syntax_bits & RE_CONTEXT_INDEP_OPS)

+ && (!caret_allowed ||

+ ((syntax_bits & RE_TIGHT_VBAR) && lexptr - 1 != lexstart)))

+ goto normal_char;

+ caret_allowed = 0;

+ return syntax_bits & RE_TIGHT_VBAR ? _ALLBEGLINE : _BEGLINE;

+ case '$':

+ if (syntax_bits & RE_CONTEXT_INDEP_OPS || !lexleft

+ || (! (syntax_bits & RE_TIGHT_VBAR)

+ && ((syntax_bits & RE_NO_BK_PARENS

+ ? lexleft > 0 && *lexptr == ')'

+ : lexleft > 1 && *lexptr == '\\' && lexptr[1] == ')')

+ || (syntax_bits & RE_NO_BK_VBAR

+ ? lexleft > 0 && *lexptr == '|'

+ : lexleft > 1 && *lexptr == '\\' && lexptr[1] == '|'))))

+ return syntax_bits & RE_TIGHT_VBAR ? _ALLENDLINE : _ENDLINE;

+ goto normal_char;

+ case '\\':

+ FETCH(c, "Unfinished \\ quote");

+ switch (c)

+ {

+ case '1':

+ case '2':

+ case '3':

+ case '4':

+ case '5':

+ case '6':

+ case '7':

+ case '8':

+ case '9':

+ caret_allowed = 0;

+ closure_allowed = 1;

+ return _BACKREF;

+ case '<':

+ caret_allowed = 0;

+ return _BEGWORD;

+ case '>':

+ caret_allowed = 0;

+ return _ENDWORD;

+ case 'b':

+ caret_allowed = 0;

+ return _LIMWORD;

+ case 'B':

+ caret_allowed = 0;

+ return _NOTLIMWORD;

+ case 'w':

+ case 'W':

+ zeroset(cset);

+ for (c2 = 0; c2 < _NOTCHAR; ++c2)

+ if (ISALNUM(c2))

+ setbit(c2, cset);

+ if (c == 'W')

+ notset(cset);

+ caret_allowed = 0;

+ closure_allowed = 1;

+ return _SET + charset_index(cset);

+ case '?':

+ if (syntax_bits & RE_BK_PLUS_QM)

+ goto qmark;

+ goto normal_char;

+ case '+':

+ if (syntax_bits & RE_BK_PLUS_QM)

+ goto plus;

+ goto normal_char;

+ case '|':

+ if (! (syntax_bits & RE_NO_BK_VBAR))

+ goto or;

+ goto normal_char;

+ case '(':

+ if (! (syntax_bits & RE_NO_BK_PARENS))

+ goto lparen;

+ goto normal_char;

+ case ')':

+ if (! (syntax_bits & RE_NO_BK_PARENS))

+ goto rparen;

+ goto normal_char;

+ default:

+ goto normal_char;

+ }

+ case '?':

+ if (syntax_bits & RE_BK_PLUS_QM)

+ goto normal_char;

+ qmark:

+ if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)

+ goto normal_char;

+ return _QMARK;

+ case '*':

+ if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)

+ goto normal_char;

+ return _STAR;

+ case '+':

+ if (syntax_bits & RE_BK_PLUS_QM)

+ goto normal_char;

+ plus:

+ if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)

+ goto normal_char;

+ return _PLUS;

+ case '|':

+ if (! (syntax_bits & RE_NO_BK_VBAR))

+ goto normal_char;

+ or:

+ caret_allowed = 1;

+ closure_allowed = 0;

+ return _OR;

+ case '\n':

+ if (! (syntax_bits & RE_NEWLINE_OR))

+ goto normal_char;

+ goto or;

+ case '(':

+ if (! (syntax_bits & RE_NO_BK_PARENS))

+ goto normal_char;

+ lparen:

+ caret_allowed = 1;

+ closure_allowed = 0;

+ return _LPAREN;

+ case ')':

+ if (! (syntax_bits & RE_NO_BK_PARENS))

+ goto normal_char;

+ rparen:

+ caret_allowed = 0;

+ closure_allowed = 1;

+ return _RPAREN;

+ case '.':

+ zeroset(cset);

+ notset(cset);

+ clrbit('\n', cset);

+ caret_allowed = 0;

+ closure_allowed = 1;

+ return _SET + charset_index(cset);

+ case '[':

+ zeroset(cset);

+ FETCH(c, "Unbalanced [");

+ if (c == '^')

+ {

+ FETCH(c, "Unbalanced [");

+ invert = 1;

+ }

+ else

+ invert = 0;

+ do

+ {

+ FETCH(c2, "Unbalanced [");

+ if ((syntax_bits & RE_AWK_CLASS_HACK) && c == '\\')

+ {

+ c = c2;

+ FETCH(c2, "Unbalanced [");

+ }

+ if (c2 == '-')

+ {

+ FETCH(c2, "Unbalanced [");

+ if (c2 == ']' && (syntax_bits & RE_AWK_CLASS_HACK))

+ {

+ setbit(c, cset);

+ setbit('-', cset);

+ break;

+ }

+ while (c <= c2)

+ setbit(c++, cset);

+ FETCH(c, "Unbalanced [");

+ }

+ else

+ {

+ setbit(c, cset);

+ c = c2;

+ }

+ while (c != ']');

+ if (invert)

+ notset(cset);

+ caret_allowed = 0;

+ closure_allowed = 1;

+ return _SET + charset_index(cset);

+ default:

+ normal_char:

+ caret_allowed = 0;

+ closure_allowed = 1;

+ if (case_fold && ISALPHA(c))

+ {

+ zeroset(cset);

+ if (isupper(c))

+ c = tolower(c);

+ setbit(c, cset);

+ setbit(toupper(c), cset);

+ return _SET + charset_index(cset);

+ }

+ return c;

+ }

+/* Recursive descent parser for regular expressions. */

+static _token tok; /* Lookahead token. */

+static depth; /* Current depth of a hypothetical stack

+ holding deferred productions. This is

+ used to determine the depth that will be

+ required of the real stack later on in

+ reganalyze(). */

+/* Add the given token to the parse tree, maintaining the depth count and

+ updating the maximum depth if necessary. */

+static void

+addtok(t)

+ _token t;

+ REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex);

+ reg->tokens[reg->tindex++] = t;

+ switch (t)

+ {

+ case _QMARK:

+ case _STAR:

+ case _PLUS:

+ break;

+ case _CAT:

+ case _OR:

+ --depth;

+ break;

+ default:

+ ++reg->nleaves;

+ case _EMPTY:

+ ++depth;

+ break;

+ }

+ if (depth > reg->depth)

+ reg->depth = depth;

+/* The grammar understood by the parser is as follows.

+ start:

+ regexp

+ _ALLBEGLINE regexp

+ regexp _ALLENDLINE

+ _ALLBEGLINE regexp _ALLENDLINE

+ regexp:

+ regexp _OR branch

+ branch

+ branch:

+ branch closure

+ closure

+ closure:

+ closure _QMARK

+ closure _STAR

+ closure _PLUS

+ atom

+ atom:

+ <normal character>

+ _SET

+ _BACKREF

+ _BEGLINE

+ _ENDLINE

+ _BEGWORD

+ _ENDWORD

+ _LIMWORD

+ _NOTLIMWORD

+ <empty>

+ The parser builds a parse tree in postfix form in an array of tokens. */

+#ifdef __STDC__

+static void regexp(void);

+#else

+static void regexp();

+#endif

+static void

+atom()

+ if (tok >= 0 && (tok < _NOTCHAR || tok >= _SET || tok == _BACKREF

+ || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD

+ || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD))

+ {

+ addtok(tok);

+ tok = lex();

+ }

+ else if (tok == _LPAREN)

+ {

+ tok = lex();

+ regexp();

+ if (tok != _RPAREN)

+ reg_error("Unbalanced (");

+ tok = lex();

+ }

+ else

+ addtok(_EMPTY);

+static void

+closure()

+ atom();

+ while (tok == _QMARK || tok == _STAR || tok == _PLUS)

+ {

+ addtok(tok);

+ tok = lex();

+ }

+static void

+branch()

+ closure();

+ while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0)

+ {

+ closure();

+ addtok(_CAT);

+ }

+static void

+regexp()

+ branch();

+ while (tok == _OR)

+ {

+ tok = lex();

+ branch();

+ addtok(_OR);

+ }

+/* Main entry point for the parser. S is a string to be parsed, len is the

+ length of the string, so s can include NUL characters. R is a pointer to

+ the struct regexp to parse into. */

+void

+regparse(s, len, r)

+ const char *s;

+ size_t len;

+ struct regexp *r;

+ reg = r;

+ lexstart = lexptr = s;

+ lexleft = len;

+ caret_allowed = 1;

+ closure_allowed = 0;

+ if (! syntax_bits_set)

+ reg_error("No syntax specified");

+ tok = lex();

+ depth = r->depth;

+ if (tok == _ALLBEGLINE)

+ {

+ addtok(_BEGLINE);

+ tok = lex();

+ regexp();

+ addtok(_CAT);

+ }

+ else

+ regexp();

+ if (tok == _ALLENDLINE)

+ {

+ addtok(_ENDLINE);

+ addtok(_CAT);

+ tok = lex();

+ }

+ if (tok != _END)

+ reg_error("Unbalanced )");

+ addtok(_END - r->nregexps);

+ addtok(_CAT);

+ if (r->nregexps)

+ addtok(_OR);

+ ++r->nregexps;

+/* Some primitives for operating on sets of positions. */

+/* Copy one set to another; the destination must be large enough. */

+static void

+copy(src, dst)

+ const _position_set *src;

+ _position_set *dst;

+ int i;

+ for (i = 0; i < src->nelem; ++i)

+ dst->elems[i] = src->elems[i];

+ dst->nelem = src->nelem;

+/* Insert a position in a set. Position sets are maintained in sorted

+ order according to index. If position already exists in the set with

+ the same index then their constraints are logically or'd together.

+ S->elems must point to an array large enough to hold the resulting set.

+ The order is descending: the scan below skips every element whose index

+ is greater than p.index and stops at the first one that is not. */

+static void

+insert(p, s)

+ _position p;

+ _position_set *s;

+ int i;

+ _position t1, t2;

+ for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)

+ ; /* empty body: the loop only advances i to the insertion point */

+ if (i < s->nelem && p.index == s->elems[i].index)

+ s->elems[i].constraint |= p.constraint; /* same index already present: merge constraints */

+ else

+ {

+ t1 = p;

+ ++s->nelem;

+ while (i < s->nelem) /* put p at slot i and shift the tail one slot to the right */

+ {

+ t2 = s->elems[i];

+ s->elems[i++] = t1;

+ t1 = t2;

+ }

+/* Merge two sets of positions into a third. The result is exactly as if

+ the positions of both sets were inserted into an initially empty set. */

+static void

+merge(s1, s2, m)

+ _position_set *s1;

+ _position_set *s2;

+ _position_set *m;

+ int i = 0, j = 0;

+ m->nelem = 0;

+ while (i < s1->nelem && j < s2->nelem)

+ if (s1->elems[i].index > s2->elems[j].index)

+ m->elems[m->nelem++] = s1->elems[i++];

+ else if (s1->elems[i].index < s2->elems[j].index)

+ m->elems[m->nelem++] = s2->elems[j++];

+ else

+ {

+ m->elems[m->nelem] = s1->elems[i++];

+ m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;

+ }

+ while (i < s1->nelem)

+ m->elems[m->nelem++] = s1->elems[i++];

+ while (j < s2->nelem)

+ m->elems[m->nelem++] = s2->elems[j++];

+/* Delete a position from a set. */

+static void

+delete(p, s)

+ _position p;

+ _position_set *s;

+ int i;

+ for (i = 0; i < s->nelem; ++i)

+ if (p.index == s->elems[i].index)

+ break;

+ if (i < s->nelem)

+ for (--s->nelem; i < s->nelem; ++i)

+ s->elems[i] = s->elems[i + 1];

+/* Find the index of the state corresponding to the given position set with

+ the given preceding context, or create a new state if there is no such

+ state. Newline and letter tell whether we got here on a newline or

+ letter, respectively. */

+static int

+state_index(r, s, newline, letter)

+ struct regexp *r;

+ _position_set *s;

+ int newline;

+ int letter;

+ int lhash = 0;

+ int constraint;

+ int i, j;

+ newline = newline ? 1 : 0;

+ letter = letter ? 1 : 0;

+ for (i = 0; i < s->nelem; ++i)

+ lhash ^= s->elems[i].index + s->elems[i].constraint;

+ /* Try to find a state that exactly matches the proposed one. */

+ for (i = 0; i < r->sindex; ++i)

+ {

+ if (lhash != r->states[i].hash || s->nelem != r->states[i].elems.nelem

+ || newline != r->states[i].newline || letter != r->states[i].letter)

+ continue;

+ for (j = 0; j < s->nelem; ++j)

+ if (s->elems[j].constraint

+ != r->states[i].elems.elems[j].constraint

+ || s->elems[j].index != r->states[i].elems.elems[j].index)

+ break;

+ if (j == s->nelem)

+ return i;

+ }

+ /* We'll have to create a new state. */

+ REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex);

+ r->states[i].hash = lhash;

+ MALLOC(r->states[i].elems.elems, _position, s->nelem);

+ copy(s, &r->states[i].elems);

+ r->states[i].newline = newline;

+ r->states[i].letter = letter;

+ r->states[i].backref = 0;

+ r->states[i].constraint = 0;

+ r->states[i].first_end = 0;

+ for (j = 0; j < s->nelem; ++j)

+ if (r->tokens[s->elems[j].index] < 0)

+ {

+ constraint = s->elems[j].constraint;

+ if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)

+ || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)

+ || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)

+ || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))

+ r->states[i].constraint |= constraint;

+ if (! r->states[i].first_end)

+ r->states[i].first_end = r->tokens[s->elems[j].index];

+ }

+ else if (r->tokens[s->elems[j].index] == _BACKREF)

+ {

+ r->states[i].constraint = _NO_CONSTRAINT;

+ r->states[i].backref = 1;

+ }

+ ++r->sindex;

+ return i;

+/* Find the epsilon closure of a set of positions. If any position of the set

+ contains a symbol that matches the empty string in some context, replace

+ that position with the elements of its follow labeled with an appropriate

+ constraint. Repeat exhaustively until no funny positions are left.

+ S->elems must be large enough to hold the result.

+ R supplies the token array and the follow sets. A scratch array with one

+ flag per token records which positions have already been expanded, so

+ each one is replaced by its follow set at most once per call. */

+static void

+epsclosure(s, r)

+ _position_set *s;

+ struct regexp *r;

+ int i, j;

+ int *visited;

+ _position p, old;

+ MALLOC(visited, int, r->tindex);

+ for (i = 0; i < r->tindex; ++i)

+ visited[i] = 0;

+ for (i = 0; i < s->nelem; ++i)

+ if (r->tokens[s->elems[i].index] >= _NOTCHAR

+ && r->tokens[s->elems[i].index] != _BACKREF

+ && r->tokens[s->elems[i].index] < _SET)

+ {

+ old = s->elems[i];

+ p.constraint = old.constraint;

+ delete(s->elems[i], s);

+ if (visited[old.index])

+ {

+ --i; /* the delete shifted the next element into slot i */

+ continue;

+ }

+ visited[old.index] = 1;

+ switch (r->tokens[old.index])

+ {

+ case _BEGLINE:

+ p.constraint &= _BEGLINE_CONSTRAINT;

+ break;

+ case _ENDLINE:

+ p.constraint &= _ENDLINE_CONSTRAINT;

+ break;

+ case _BEGWORD:

+ p.constraint &= _BEGWORD_CONSTRAINT;

+ break;

+ case _ENDWORD:

+ p.constraint &= _ENDWORD_CONSTRAINT;

+ break;

+ case _LIMWORD: /* \b is its own token; it must not reuse the \> constraint */

+ p.constraint &= _LIMWORD_CONSTRAINT;

+ break;

+ case _NOTLIMWORD:

+ p.constraint &= _NOTLIMWORD_CONSTRAINT;

+ break;

+ default:

+ break;

+ }

+ for (j = 0; j < r->follows[old.index].nelem; ++j)

+ {

+ p.index = r->follows[old.index].elems[j].index;

+ insert(p, s);

+ }

+ /* Force rescan to start at the beginning. */

+ i = -1;

+ }

+ free(visited);

+/* Perform bottom-up analysis on the parse tree, computing various functions.

+ Note that at this point, we're pretending constructs like \< are real

+ characters rather than constraints on what can follow them.

+ Nullable: A node is nullable if it is at the root of a regexp that can

+ match the empty string.

+ * _EMPTY leaves are nullable.

+ * No other leaf is nullable.

+ * A _QMARK or _STAR node is nullable.

+ * A _PLUS node is nullable if its argument is nullable.

+ * A _CAT node is nullable if both its arguments are nullable.

+ * An _OR node is nullable if either argument is nullable.

+ Firstpos: The firstpos of a node is the set of positions (nonempty leaves)

+ that could correspond to the first character of a string matching the

+ regexp rooted at the given node.

+ * _EMPTY leaves have empty firstpos.

+ * The firstpos of a nonempty leaf is that leaf itself.

+ * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its

+ argument.

+ * The firstpos of a _CAT node is the firstpos of the left argument, union

+ the firstpos of the right if the left argument is nullable.

+ * The firstpos of an _OR node is the union of firstpos of each argument.

+ Lastpos: The lastpos of a node is the set of positions that could

+ correspond to the last character of a string matching the regexp at

+ the given node.

+ * _EMPTY leaves have empty lastpos.

+ * The lastpos of a nonempty leaf is that leaf itself.

+ * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its

+ argument.

+ * The lastpos of a _CAT node is the lastpos of its right argument, union

+ the lastpos of the left if the right argument is nullable.

+ * The lastpos of an _OR node is the union of the lastpos of each argument.

+ Follow: The follow of a position is the set of positions that could

+ correspond to the character following a character matching the node in

+ a string matching the regexp. At this point we consider special symbols

+ that match the empty string in some context to be just normal characters.

+ Later, if we find that a special symbol is in a follow set, we will

+ replace it with the elements of its follow, labeled with an appropriate

+ constraint.

+ * Every node in the firstpos of the argument of a _STAR or _PLUS node is in

+ the follow of every node in the lastpos.

+ * Every node in the firstpos of the second argument of a _CAT node is in

+ the follow of every node in the lastpos of the first argument.

+ Because of the postfix representation of the parse tree, the depth-first

+ analysis is conveniently done by a linear scan with the aid of a stack.

+ Sets are stored as arrays of the elements, obeying a stack-like allocation

+ scheme; the number of elements in each set deeper in the stack can be

+ used to determine the address of a particular set's array. */

+void

+reganalyze(r, searchflag)

+ struct regexp *r;

+ int searchflag;

+ int *nullable; /* Nullable stack. */

+ int *nfirstpos; /* Element count stack for firstpos sets. */

+ _position *firstpos; /* Array where firstpos elements are stored. */

+ int *nlastpos; /* Element count stack for lastpos sets. */

+ _position *lastpos; /* Array where lastpos elements are stored. */

+ int *nalloc; /* Sizes of arrays allocated to follow sets. */

+ _position_set tmp; /* Temporary set for merging sets. */

+ _position_set merged; /* Result of merging sets. */

+ int wants_newline; /* True if some position wants newline info. */

+ int *o_nullable;

+ int *o_nfirst, *o_nlast;

+ _position *o_firstpos, *o_lastpos;

+ int i, j;

+ _position *pos;

+ r->searchflag = searchflag;

+ MALLOC(nullable, int, r->depth);

+ o_nullable = nullable;

+ MALLOC(nfirstpos, int, r->depth);

+ o_nfirst = nfirstpos;

+ MALLOC(firstpos, _position, r->nleaves);

+ o_firstpos = firstpos, firstpos += r->nleaves;

+ MALLOC(nlastpos, int, r->depth);

+ o_nlast = nlastpos;

+ MALLOC(lastpos, _position, r->nleaves);

+ o_lastpos = lastpos, lastpos += r->nleaves;

+ MALLOC(nalloc, int, r->tindex);

+ for (i = 0; i < r->tindex; ++i)

+ nalloc[i] = 0;

+ MALLOC(merged.elems, _position, r->nleaves);

+ CALLOC(r->follows, _position_set, r->tindex);

+ for (i = 0; i < r->tindex; ++i)

+ switch (r->tokens[i])

+ {

+ case _EMPTY:

+ /* The empty set is nullable. */

+ *nullable++ = 1;

+ /* The firstpos and lastpos of the empty leaf are both empty. */

+ *nfirstpos++ = *nlastpos++ = 0;

+ break;

+ case _STAR:

+ case _PLUS:

+ /* Every element in the firstpos of the argument is in the follow

+ of every element in the lastpos. */

+ tmp.nelem = nfirstpos[-1];

+ tmp.elems = firstpos;

+ pos = lastpos;

+ for (j = 0; j < nlastpos[-1]; ++j)

+ {

+ merge(&tmp, &r->follows[pos[j].index], &merged);

+ REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,

+ nalloc[pos[j].index], merged.nelem - 1);

+ copy(&merged, &r->follows[pos[j].index]);

+ }

+ case _QMARK:

+ /* A _QMARK or _STAR node is automatically nullable. */

+ if (r->tokens[i] != _PLUS)

+ nullable[-1] = 1;

+ break;

+ case _CAT:

+ /* Every element in the firstpos of the second argument is in the

+ follow of every element in the lastpos of the first argument. */

+ tmp.nelem = nfirstpos[-1];

+ tmp.elems = firstpos;

+ pos = lastpos + nlastpos[-1];

+ for (j = 0; j < nlastpos[-2]; ++j)

+ {

+ merge(&tmp, &r->follows[pos[j].index], &merged);

+ REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,

+ nalloc[pos[j].index], merged.nelem - 1);

+ copy(&merged, &r->follows[pos[j].index]);

+ }

+ /* The firstpos of a _CAT node is the firstpos of the first argument,

+ union that of the second argument if the first is nullable. */

+ if (nullable[-2])

+ nfirstpos[-2] += nfirstpos[-1];

+ else

+ firstpos += nfirstpos[-1];

+ --nfirstpos;

+ /* The lastpos of a _CAT node is the lastpos of the second argument,

+ union that of the first argument if the second is nullable. */

+ if (nullable[-1])

+ nlastpos[-2] += nlastpos[-1];

+ else

+ {

+ pos = lastpos + nlastpos[-2];

+ for (j = nlastpos[-1] - 1; j >= 0; --j)

+ pos[j] = lastpos[j];

+ lastpos += nlastpos[-2];

+ nlastpos[-2] = nlastpos[-1];

+ }

+ --nlastpos;

+ /* A _CAT node is nullable if both arguments are nullable. */

+ nullable[-2] = nullable[-1] && nullable[-2];

+ --nullable;

+ break;

+ case _OR:

+ /* The firstpos is the union of the firstpos of each argument. */

+ nfirstpos[-2] += nfirstpos[-1];

+ --nfirstpos;

+ /* The lastpos is the union of the lastpos of each argument. */

+ nlastpos[-2] += nlastpos[-1];

+ --nlastpos;

+ /* An _OR node is nullable if either argument is nullable. */

+ nullable[-2] = nullable[-1] || nullable[-2];

+ --nullable;

+ break;

+ default:

+ /* Anything else is a nonempty position. (Note that special

+ constructs like \< are treated as nonempty strings here;

+ an "epsilon closure" effectively makes them nullable later.

+ Backreferences have to get a real position so we can detect

+ transitions on them later. But they are nullable. */

+ *nullable++ = r->tokens[i] == _BACKREF;

+ /* This position is in its own firstpos and lastpos. */

+ *nfirstpos++ = *nlastpos++ = 1;

+ --firstpos, --lastpos;

+ firstpos->index = lastpos->index = i;

+ firstpos->constraint = lastpos->constraint = _NO_CONSTRAINT;

+ /* Allocate the follow set for this position. */

+ nalloc[i] = 1;

+ MALLOC(r->follows[i].elems, _position, nalloc[i]);

+ break;

+ }

+ /* For each follow set that is the follow set of a real position, replace

+ it with its epsilon closure. */

+ for (i = 0; i < r->tindex; ++i)

+ if (r->tokens[i] < _NOTCHAR || r->tokens[i] == _BACKREF

+ || r->tokens[i] >= _SET)

+ {

+ copy(&r->follows[i], &merged);

+ epsclosure(&merged, r);

+ if (r->follows[i].nelem < merged.nelem)

+ REALLOC(r->follows[i].elems, _position, merged.nelem);

+ copy(&merged, &r->follows[i]);

+ }

+ /* Get the epsilon closure of the firstpos of the regexp. The result will

+ be the set of positions of state 0. */

+ merged.nelem = 0;

+ for (i = 0; i < nfirstpos[-1]; ++i)

+ insert(firstpos[i], &merged);

+ epsclosure(&merged, r);

+ /* Check if any of the positions of state 0 will want newline context. */

+ wants_newline = 0;

+ for (i = 0; i < merged.nelem; ++i)

+ if (_PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))

+ wants_newline = 1;

+ /* Build the initial state. */

+ r->salloc = 1;

+ r->sindex = 0;

+ MALLOC(r->states, _dfa_state, r->salloc);

+ state_index(r, &merged, wants_newline, 0);

+ free(o_nullable);

+ free(o_nfirst);

+ free(o_firstpos);

+ free(o_nlast);

+ free(o_lastpos);

+ free(nalloc);

+ free(merged.elems);

+/* Find, for each character, the transition out of state s of r, and store

+ it in the appropriate slot of trans.

+ We divide the positions of s into groups (positions can appear in more

+ than one group). Each group is labeled with a set of characters that

+ every position in the group matches (taking into account, if necessary,

+ preceding context information of s). For each group, find the union

+ of the its elements' follows. This set is the set of positions of the

+ new state. For each character in the group's label, set the transition

+ on this character to be to a state corresponding to the set's positions,

+ and its associated backward context information, if necessary.

+ If we are building a searching matcher, we include the positions of state

+ 0 in every state.

+ The collection of groups is constructed by building an equivalence-class

+ partition of the positions of s.

+ For each position, find the set of characters C that it matches. Eliminate

+ any characters from C that fail on grounds of backward context.

+ Search through the groups, looking for a group whose label L has nonempty

+ intersection with C. If L - C is nonempty, create a new group labeled

+ L - C and having the same positions as the current group, and set L to

+ the intersection of L and C. Insert the position in this group, set

+ C = C - L, and resume scanning.

+ If after comparing with every group there are characters remaining in C,

+ create a new group labeled with the characters of C and insert this

+ position in that group. */

+void

+regstate(s, r, trans)

+ int s;

+ struct regexp *r;

+ int trans[];

+ _position_set grps[_NOTCHAR]; /* As many as will ever be needed. */

+ _charset labels[_NOTCHAR]; /* Labels corresponding to the groups. */

+ int ngrps = 0; /* Number of groups actually used. */

+ _position pos; /* Current position being considered. */

+ _charset matches; /* Set of matching characters. */

+ int matchesf; /* True if matches is nonempty. */

+ _charset intersect; /* Intersection with some label set. */

+ int intersectf; /* True if intersect is nonempty. */

+ _charset leftovers; /* Stuff in the label that didn't match. */

+ int leftoversf; /* True if leftovers is nonempty. */

+ static _charset letters; /* Set of characters considered letters. */

+ static _charset newline; /* Set of characters that aren't newline. */

+ _position_set follows; /* Union of the follows of some group. */

+ _position_set tmp; /* Temporary space for merging sets. */

+ int state; /* New state. */

+ int wants_newline; /* New state wants to know newline context. */

+ int state_newline; /* New state on a newline transition. */

+ int wants_letter; /* New state wants to know letter context. */

+ int state_letter; /* New state on a letter transition. */

+ static initialized; /* Flag for static initialization. */

+ int i, j, k;

+ /* Initialize the set of letters, if necessary. */

+ if (! initialized)

+ {

+ initialized = 1;

+ for (i = 0; i < _NOTCHAR; ++i)

+ if (ISALNUM(i))

+ setbit(i, letters);

+ setbit('\n', newline);

+ }

+ zeroset(matches);

+ for (i = 0; i < r->states[s].elems.nelem; ++i)

+ {

+ pos = r->states[s].elems.elems[i];

+ if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR)

+ setbit(r->tokens[pos.index], matches);

+ else if (r->tokens[pos.index] >= _SET)

+ copyset(r->charsets[r->tokens[pos.index] - _SET], matches);

+ else

+ continue;

+ /* Some characters may need to be climinated from matches because

+ they fail in the current context. */

+ if (pos.constraint != 0xff)

+ {

+ if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint,

+ r->states[s].newline, 1))

+ clrbit('\n', matches);

+ if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint,

+ r->states[s].newline, 0))

+ for (j = 0; j < _CHARSET_INTS; ++j)

+ matches[j] &= newline[j];

+ if (! _MATCHES_LETTER_CONTEXT(pos.constraint,

+ r->states[s].letter, 1))

+ for (j = 0; j < _CHARSET_INTS; ++j)

+ matches[j] &= ~letters[j];

+ if (! _MATCHES_LETTER_CONTEXT(pos.constraint,

+ r->states[s].letter, 0))

+ for (j = 0; j < _CHARSET_INTS; ++j)

+ matches[j] &= letters[j];

+ /* If there are no characters left, there's no point in going on. */

+ for (j = 0; j < _CHARSET_INTS && !matches[j]; ++j)

+ ;

+ if (j == _CHARSET_INTS)

+ continue;

+ }

+ for (j = 0; j < ngrps; ++j)

+ {

+ /* If matches contains a single character only, and the current

+ group's label doesn't contain that character, go on to the

+ next group. */

+ if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR

+ && !tstbit(r->tokens[pos.index], labels[j]))

+ continue;

+ /* Check if this group's label has a nonempty intersection with

+ matches. */

+ intersectf = 0;

+ for (k = 0; k < _CHARSET_INTS; ++k)

+ (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0;

+ if (! intersectf)

+ continue;

+ /* It does; now find the set differences both ways. */

+ leftoversf = matchesf = 0;

+ for (k = 0; k < _CHARSET_INTS; ++k)

+ {

+ /* Even an optimizing compiler can't know this for sure. */

+ int match = matches[k], label = labels[j][k];

+ (leftovers[k] = ~match & label) ? leftoversf = 1 : 0;

+ (matches[k] = match & ~label) ? matchesf = 1 : 0;

+ }

+ /* If there were leftovers, create a new group labeled with them. */

+ if (leftoversf)

+ {

+ copyset(leftovers, labels[ngrps]);

+ copyset(intersect, labels[j]);

+ MALLOC(grps[ngrps].elems, _position, r->nleaves);

+ copy(&grps[j], &grps[ngrps]);

+ ++ngrps;

+ }

+ /* Put the position in the current group. Note that there is no

+ reason to call insert() here. */

+ grps[j].elems[grps[j].nelem++] = pos;

+ /* If every character matching the current position has been

+ accounted for, we're done. */

+ if (! matchesf)

+ break;

+ }

+ /* If we've passed the last group, and there are still characters

+ unaccounted for, then we'll have to create a new group. */

+ if (j == ngrps)

+ {

+ copyset(matches, labels[ngrps]);

+ zeroset(matches);

+ MALLOC(grps[ngrps].elems, _position, r->nleaves);

+ grps[ngrps].nelem = 1;

+ grps[ngrps].elems[0] = pos;

+ ++ngrps;

+ }

+ MALLOC(follows.elems, _position, r->nleaves);

+ MALLOC(tmp.elems, _position, r->nleaves);

+ /* If we are a searching matcher, the default transition is to a state

+ containing the positions of state 0, otherwise the default transition

+ is to fail miserably. */

+ if (r->searchflag)

+ {

+ wants_newline = 0;

+ wants_letter = 0;

+ for (i = 0; i < r->states[0].elems.nelem; ++i)

+ {

+ if (_PREV_NEWLINE_DEPENDENT(r->states[0].elems.elems[i].constraint))

+ wants_newline = 1;

+ if (_PREV_LETTER_DEPENDENT(r->states[0].elems.elems[i].constraint))

+ wants_letter = 1;

+ }

+ copy(&r->states[0].elems, &follows);

+ state = state_index(r, &follows, 0, 0);

+ if (wants_newline)

+ state_newline = state_index(r, &follows, 1, 0);

+ else

+ state_newline = state;

+ if (wants_letter)

+ state_letter = state_index(r, &follows, 0, 1);

+ else

+ state_letter = state;

+ for (i = 0; i < _NOTCHAR; ++i)

+ trans[i] = (ISALNUM(i)) ? state_letter : state ;

+ trans['\n'] = state_newline;

+ }

+ else

+ for (i = 0; i < _NOTCHAR; ++i)

+ trans[i] = -1;

+ for (i = 0; i < ngrps; ++i)

+ {

+ follows.nelem = 0;

+ /* Find the union of the follows of the positions of the group.

+ This is a hideously inefficient loop. Fix it someday. */

+ for (j = 0; j < grps[i].nelem; ++j)

+ for (k = 0; k < r->follows[grps[i].elems[j].index].nelem; ++k)

+ insert(r->follows[grps[i].elems[j].index].elems[k], &follows);

+ /* If we are building a searching matcher, throw in the positions

+ of state 0 as well. */

+ if (r->searchflag)

+ for (j = 0; j < r->states[0].elems.nelem; ++j)

+ insert(r->states[0].elems.elems[j], &follows);

+ /* Find out if the new state will want any context information. */

+ wants_newline = 0;

+ if (tstbit('\n', labels[i]))

+ for (j = 0; j < follows.nelem; ++j)

+ if (_PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))

+ wants_newline = 1;

+ wants_letter = 0;

+ for (j = 0; j < _CHARSET_INTS; ++j)

+ if (labels[i][j] & letters[j])

+ break;

+ if (j < _CHARSET_INTS)

+ for (j = 0; j < follows.nelem; ++j)

+ if (_PREV_LETTER_DEPENDENT(follows.elems[j].constraint))

+ wants_letter = 1;

+ /* Find the state(s) corresponding to the union of the follows. */

+ state = state_index(r, &follows, 0, 0);

+ if (wants_newline)

+ state_newline = state_index(r, &follows, 1, 0);

+ else

+ state_newline = state;

+ if (wants_letter)

+ state_letter = state_index(r, &follows, 0, 1);

+ else

+ state_letter = state;

+ /* Set the transitions for each character in the current label. */

+ for (j = 0; j < _CHARSET_INTS; ++j)

+ for (k = 0; k < INTBITS; ++k)

+ if (labels[i][j] & 1 << k)

+ {

+ int c = j * INTBITS + k;

+ if (c == '\n')

+ trans[c] = state_newline;

+ else if (ISALNUM(c))

+ trans[c] = state_letter;

+ else if (c < _NOTCHAR)

+ trans[c] = state;

+ }

+ for (i = 0; i < ngrps; ++i)

+ free(grps[i].elems);

+ free(follows.elems);

+ free(tmp.elems);

+/* Some routines for manipulating a compiled regexp's transition tables.

+ Each state may or may not have a transition table; if it does, and it

+ is a non-accepting state, then r->trans[state] points to its table.

+ If it is an accepting state then r->fails[state] points to its table.

+ If it has no table at all, then r->trans[state] is NULL.

+ TODO: Improve this comment, get rid of the unnecessary redundancy. */

+static void

+build_state(s, r)

+ int s;

+ struct regexp *r;

+ int *trans; /* The new transition table. */

+ int i;

+ /* Set an upper limit on the number of transition tables that will ever

+ exist at once. 1024 is arbitrary. The idea is that the frequently

+ used transition tables will be quickly rebuilt, whereas the ones that

+ were only needed once or twice will be cleared away. */

+ if (r->trcount >= 1024)

+ {

+ for (i = 0; i < r->tralloc; ++i)

+ if (r->trans[i])

+ {

+ free((ptr_t) r->trans[i]);

+ r->trans[i] = NULL;

+ }

+ else if (r->fails[i])

+ {

+ free((ptr_t) r->fails[i]);

+ r->fails[i] = NULL;

+ }

+ r->trcount = 0;

+ }

+ ++r->trcount;

+ /* Set up the success bits for this state. */

+ r->success[s] = 0;

+ if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 1, r->states[s].letter, 0,

+ s, *r))

+ r->success[s] |= 4;

+ if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 1,

+ s, *r))

+ r->success[s] |= 2;

+ if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 0,

+ s, *r))

+ r->success[s] |= 1;

+ MALLOC(trans, int, _NOTCHAR);

+ regstate(s, r, trans);

+ /* Now go through the new transition table, and make sure that the trans

+ and fail arrays are allocated large enough to hold a pointer for the

+ largest state mentioned in the table. */

+ for (i = 0; i < _NOTCHAR; ++i)

+ if (trans[i] >= r->tralloc)

+ {

+ int oldalloc = r->tralloc;

+ while (trans[i] >= r->tralloc)

+ r->tralloc *= 2;

+ REALLOC(r->realtrans, int *, r->tralloc + 1);

+ r->trans = r->realtrans + 1;

+ REALLOC(r->fails, int *, r->tralloc);

+ REALLOC(r->success, int, r->tralloc);

+ REALLOC(r->newlines, int, r->tralloc);

+ while (oldalloc < r->tralloc)

+ {

+ r->trans[oldalloc] = NULL;

+ r->fails[oldalloc++] = NULL;

+ }

+ /* Keep the newline transition in a special place so we can use it as

+ a sentinel. */

+ r->newlines[s] = trans['\n'];

+ trans['\n'] = -1;

+ if (ACCEPTING(s, *r))

+ r->fails[s] = trans;

+ else

+ r->trans[s] = trans;

+static void

+build_state_zero(r)

+ struct regexp *r;

+ r->tralloc = 1;

+ r->trcount = 0;

+ CALLOC(r->realtrans, int *, r->tralloc + 1);

+ r->trans = r->realtrans + 1;

+ CALLOC(r->fails, int *, r->tralloc);

+ MALLOC(r->success, int, r->tralloc);

+ MALLOC(r->newlines, int, r->tralloc);

+ build_state(0, r);

+/* Search through a buffer looking for a match to the given struct regexp.

+ Find the first occurrence of a string matching the regexp in the buffer,

+ and the shortest possible version thereof. Return a pointer to the first

+ character after the match, or NULL if none is found. Begin points to

+ the beginning of the buffer, and end points to the first character after

+ its end. We store a newline in *end to act as a sentinel, so end had

+ better point somewhere valid. Newline is a flag indicating whether to

+ allow newlines to be in the matching string. If count is non-

+ NULL it points to a place we're supposed to increment every time we

+ see a newline. Finally, if backref is non-NULL it points to a place

+ where we're supposed to store a 1 if backreferencing happened and the

+ match needs to be verified by a backtracking matcher. Otherwise

+ we store a 0 in *backref. */

+char *

+regexecute(r, begin, end, newline, count, backref)

+ struct regexp *r;

+ char *begin;

+ char *end;

+ int newline;

+ int *count;

+ int *backref;

+ register s, s1, tmp; /* Current state. */

+ register unsigned char *p; /* Current input character. */

+ register **trans, *t; /* Copy of r->trans so it can be optimized

+ into a register. */

+ static sbit[_NOTCHAR]; /* Table for anding with r->success. */

+ static sbit_init;

+ if (! sbit_init)

+ {

+ int i;

+ sbit_init = 1;

+ for (i = 0; i < _NOTCHAR; ++i)

+ sbit[i] = (ISALNUM(i)) ? 2 : 1;

+ sbit['\n'] = 4;

+ }

+ if (! r->tralloc)

+ build_state_zero(r);

+ s = s1 = 0;

+ p = (unsigned char *) begin;

+ trans = r->trans;

+ *end = '\n';

+ for (;;)

+ {

+ while ((t = trans[s]) != 0) { /* hand-optimized loop */

+ s1 = t[*p++];

+ if ((t = trans[s1]) == 0) {

+ tmp = s ; s = s1 ; s1 = tmp ; /* swap */

+ break;

+ }

+ s = t[*p++];

+ }

+ if (s >= 0 && p <= (unsigned char *) end && r->fails[s])

+ {

+ if (r->success[s] & sbit[*p])

+ {

+ if (backref)

+ *backref = (r->states[s].backref != 0);

+ return (char *) p;

+ }

+ s1 = s;

+ s = r->fails[s][*p++];

+ continue;

+ }

+ /* If the previous character was a newline, count it. */

+ if (count && (char *) p <= end && p[-1] == '\n')

+ ++*count;

+ /* Check if we've run off the end of the buffer. */

+ if ((char *) p >= end)

+ return NULL;

+ if (s >= 0)

+ {

+ build_state(s, r);

+ trans = r->trans;

+ continue;

+ }

+ if (p[-1] == '\n' && newline)

+ {

+ s = r->newlines[s1];

+ continue;

+ }

+ s = 0;

+ }

+/* Initialize the components of a regexp that the other routines don't

+ initialize for themselves. */

+void

+reginit(r)

+ struct regexp *r;

+ r->calloc = 1;

+ MALLOC(r->charsets, _charset, r->calloc);

+ r->cindex = 0;

+ r->talloc = 1;

+ MALLOC(r->tokens, _token, r->talloc);

+ r->tindex = r->depth = r->nleaves = r->nregexps = 0;

+ r->searchflag = 0;

+ r->tralloc = 0;

+/* Parse and analyze a single string of the given length. */

+void

+regcompile(s, len, r, searchflag)

+ const char *s;

+ size_t len;

+ struct regexp *r;

+ int searchflag;

+ if (case_fold) /* dummy folding in service of regmust() */

+ {

+ char *regcopy;

+ int i;

+ regcopy = malloc(len);

+ if (!regcopy)

+ reg_error("out of memory");

+ /* This is a complete kludge and could potentially break

+ \<letter> escapes . . . */

+ case_fold = 0;

+ for (i = 0; i < len; ++i)

+ if (ISUPPER(s[i]))

+ regcopy[i] = tolower(s[i]);

+ else

+ regcopy[i] = s[i];

+ reginit(r);

+ r->mustn = 0;

+ r->must[0] = '\0';

+ regparse(regcopy, len, r);

+ free(regcopy);

+ regmust(r);

+ reganalyze(r, searchflag);

+ case_fold = 1;

+ reginit(r);

+ regparse(s, len, r);

+ reganalyze(r, searchflag);

+ }

+ else

+ {

+ reginit(r);

+ regparse(s, len, r);

+ regmust(r);

+ reganalyze(r, searchflag);

+ }

+/* Free the storage held by the components of a regexp. */

+void

+reg_free(r)

+ struct regexp *r;

+ int i;

+ free((ptr_t) r->charsets);

+ free((ptr_t) r->tokens);

+ for (i = 0; i < r->sindex; ++i)

+ free((ptr_t) r->states[i].elems.elems);

+ free((ptr_t) r->states);

+ for (i = 0; i < r->tindex; ++i)

+ if (r->follows[i].elems)

+ free((ptr_t) r->follows[i].elems);

+ free((ptr_t) r->follows);

+ for (i = 0; i < r->tralloc; ++i)

+ if (r->trans[i])

+ free((ptr_t) r->trans[i]);

+ else if (r->fails[i])

+ free((ptr_t) r->fails[i]);

+ if (r->realtrans)

+ free((ptr_t) r->realtrans);

+ if (r->fails)

+ free((ptr_t) r->fails);

+ if (r->newlines)

+ free((ptr_t) r->newlines);

+/*

+Having found the postfix representation of the regular expression,

+try to find a long sequence of characters that must appear in any line

+containing the r.e.

+Finding a "longest" sequence is beyond the scope here;

+we take an easy way out and hope for the best.

+(Take "(ab|a)b"--please.)

+We do a bottom-up calculation of sequences of characters that must appear

+in matches of r.e.'s represented by trees rooted at the nodes of the postfix

+representation:

+ sequences that must appear at the left of the match ("left")

+ sequences that must appear at the right of the match ("right")

+ lists of sequences that must appear somewhere in the match ("in")

+ sequences that must constitute the match ("is")

+When we get to the root of the tree, we use one of the longest of its

+calculated "in" sequences as our answer. The sequence we find is returned in

+r->must (where "r" is the single argument passed to "regmust");

+the length of the sequence is returned in r->mustn.

+The sequences calculated for the various types of node (in pseudo ANSI c)

+are shown below. "p" is the operand of unary operators (and the left-hand

+operand of binary operators); "q" is the right-hand operand of binary operators

+"ZERO" means "a zero-length sequence" below.

+Type    left              right             is                 in
+----    ----              -----             --                 --
+char c  # c               # c               # c                # c
+
+SET     ZERO              ZERO              ZERO               ZERO
+
+STAR    ZERO              ZERO              ZERO               ZERO
+
+QMARK   ZERO              ZERO              ZERO               ZERO
+
+PLUS    p->left           p->right          ZERO               p->in
+
+CAT     (p->is==ZERO)?    (q->is==ZERO)?    (p->is!=ZERO &&    p->in plus
+        p->left :         q->right :        q->is!=ZERO) ?     q->in plus
+        p->is##q->left    p->right##q->is   p->is##q->is :     p->right##q->left
+                                            ZERO
+
+OR      longest common    longest common    (do p->is and      substrings common to
+        leading           trailing          q->is have same    p->in and q->in
+        (sub)sequence     (sub)sequence     length and
+        of p->left        of p->right       content) ?
+        and q->left       and q->right      p->is : NULL

+If there's anything else we recognize in the tree, all four sequences get set

+to zero-length sequences. If there's something we don't recognize in the tree,

+we just return a zero-length sequence.

+Break ties in favor of infrequent letters (choosing 'zzz' in preference to

+'aaa')?

+And. . .is it here or someplace that we might ponder "optimizations" such as

+ egrep 'psi|epsilon' -> egrep 'psi'

+ egrep 'pepsi|epsilon' -> egrep 'epsi'

+ (Yes, we now find "epsi" as a "string

+ that must occur", but we might also

+ simplify the *entire* r.e. being sought)

+ grep '[c]' -> grep 'c'

+ grep '(ab|a)b' -> grep 'ab'

+ grep 'ab*' -> grep 'a'

+ grep 'a*b' -> grep 'b'

+There are several issues:

+ Is optimization easy (enough)?

+ Does optimization actually accomplish anything,

+ or is the automaton you get from "psi|epsilon" (for example)

+ the same as the one you get from "psi" (for example)?

+ Are optimizable r.e.'s likely to be used in real-life situations

+ (something like 'ab*' is probably unlikely; something like

+ 'psi|epsilon' is likelier)?

+*/

/* Append the string NEW to the heap string OLD and return the result.

   OLD may be NULL, in which case a fresh copy of NEW is allocated; NEW may
   be NULL, which is treated as the empty string.  When OLD is non-NULL and
   NEW is empty, OLD itself is returned untouched.  Otherwise OLD is handed
   to realloc(), so the caller must use only the returned pointer.  Returns
   NULL if the allocation fails.  */
static char *
icatalloc(old, new)
     char *old;
     const char *new;
{
  register char *result;
  register size_t oldsize, newsize;

  newsize = (new == NULL) ? 0 : strlen(new);
  if (old == NULL)
    oldsize = 0;
  else if (newsize == 0)
    return old;
  else
    oldsize = strlen(old);

  if (old == NULL)
    result = (char *) malloc(newsize + 1);
  else
    result = (char *) realloc((void *) old, oldsize + newsize + 1);

  if (result != NULL)
    {
      if (new != NULL)
        (void) strcpy(result + oldsize, new);
      else
        /* Only reachable with OLD == NULL: make the fresh one-byte buffer
           a valid empty string instead of leaving it uninitialized.  */
        result[oldsize] = '\0';
    }
  return result;
}

/* Return a freshly allocated copy of STRING (via icatalloc with no old
   string), or NULL if the allocation fails.  */
static char *
icpyalloc(string)
     const char *string;
{
  return icatalloc((char *) NULL, string);
}

/* Return a pointer to the first occurrence of LOOKFOR in LOOKIN, or NULL
   if there is none.  Only non-empty suffixes of LOOKIN are tried, so an
   empty LOOKIN never matches, not even an empty LOOKFOR.  */
static char *
istrstr(lookin, lookfor)
     char *lookin;
     register char *lookfor;
{
  register char *cp;
  register size_t len;

  len = strlen(lookfor);
  for (cp = lookin; *cp != '\0'; ++cp)
    if (strncmp(cp, lookfor, len) == 0)
      return cp;
  return NULL;
}

/* Free CP unless it is NULL.  */
static void
ifree(cp)
     char *cp;
{
  if (cp != NULL)
    free(cp);
}

/* Free every string in the NULL-terminated list CPP and set each slot to
   NULL.  The array CPP itself is not freed.  A NULL list is ignored.  */
static void
freelist(cpp)
     register char **cpp;
{
  register int i;

  if (cpp == NULL)
    return;
  for (i = 0; cpp[i] != NULL; ++i)
    {
      free(cpp[i]);
      cpp[i] = NULL;
    }
}

/* Add the first LEN characters of NEW to the NULL-terminated string list
   CPP, keeping the list free of strings that are substrings of another
   entry: nothing is added if an existing entry already contains the new
   string, and existing entries contained in the new string are dropped.

   Returns the (possibly reallocated) list, or NULL if CPP is NULL or an
   allocation fails.  On allocation failure the list's strings are freed;
   as with the other failure paths in this file, the array itself is not.  */
static char **
enlist(cpp, new, len)
     register char **cpp;
     register char *new;
#ifdef __STDC__
     size_t len;
#else
     int len;
#endif
{
  register int i, j;
  char **grown;

  if (cpp == NULL)
    return NULL;
  if ((new = icpyalloc(new)) == NULL)
    {
      freelist(cpp);
      return NULL;
    }
  new[len] = '\0';

  /* Is there already something in the list that's new (or longer)?  */
  for (i = 0; cpp[i] != NULL; ++i)
    if (istrstr(cpp[i], new) != NULL)
      {
        free(new);
        return cpp;
      }

  /* Eliminate any obsoleted strings.  I is the index of the terminator;
     each removal moves the last live entry into the hole.  */
  j = 0;
  while (cpp[j] != NULL)
    if (istrstr(new, cpp[j]) == NULL)
      ++j;
    else
      {
        free(cpp[j]);
        if (--i == j)
          break;
        cpp[j] = cpp[i];
        /* Re-terminate the list.  Without this the vacated slot keeps a
           stale copy of the moved pointer, which the scan would visit
           again and could free a second time.  */
        cpp[i] = NULL;
      }

  /* Add the new string.  Keep the old pointer until realloc succeeds so
     that nothing is lost on failure.  */
  grown = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
  if (grown == NULL)
    {
      /* Slot I may hold a dangling pointer after the break above.  */
      cpp[i] = NULL;
      freelist(cpp);
      free(new);
      return NULL;
    }
  cpp = grown;
  cpp[i] = new;
  cpp[i + 1] = NULL;
  return cpp;
}

/*
** Given pointers to two strings,
** return a pointer to an allocated list of their distinct common substrings.
** Return NULL if something seems wild.
**
** For every starting point in LEFT, the longest run that also occurs
** somewhere in RIGHT is handed to enlist().  The result is a
** NULL-terminated list; it is empty when the strings share no character.
*/
static char **
comsubs(left, right)
     char *left;
     char *right;
{
  register char **cpp;
  register char *lcp;
  register char *rcp;
  register int i, len;

  if (left == NULL || right == NULL)
    return NULL;
  cpp = (char **) malloc(sizeof *cpp);
  if (cpp == NULL)
    return NULL;
  cpp[0] = NULL;
  for (lcp = left; *lcp != '\0'; ++lcp)
    {
      /* Longest match in RIGHT of the text starting at LCP.  */
      len = 0;
      rcp = strchr(right, *lcp);
      while (rcp != NULL)
        {
          for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
            ;
          if (i > len)
            len = i;
          rcp = strchr(rcp + 1, *lcp);
        }
      if (len == 0)
        continue;
#ifdef __STDC__
      if ((cpp = enlist(cpp, lcp, (size_t)len)) == NULL)
#else
      if ((cpp = enlist(cpp, lcp, len)) == NULL)
#endif
        break;
    }
  return cpp;
}

/* Add every string of the NULL-terminated list NEW to the list OLD using
   enlist(), and return the resulting list.  Returns NULL if either list is
   NULL or if enlist() fails.  NEW is left untouched.  */
static char **
addlists(old, new)
     char **old;
     char **new;
{
  register int i;

  if (old == NULL || new == NULL)
    return NULL;
  for (i = 0; new[i] != NULL; ++i)
    {
      old = enlist(old, new[i], strlen(new[i]));
      if (old == NULL)
        break;
    }
  return old;
}

/*
** Given two lists of substrings,
** return a new list giving substrings common to both.
**
** Returns NULL if either list is NULL or an allocation fails.  Neither
** input list is modified.
*/
static char **
inboth(left, right)
     char **left;
     char **right;
{
  register char **both;
  register char **temp;
  register int lnum, rnum;

  if (left == NULL || right == NULL)
    return NULL;
  both = (char **) malloc(sizeof *both);
  if (both == NULL)
    return NULL;
  both[0] = NULL;
  for (lnum = 0; left[lnum] != NULL; ++lnum)
    {
      for (rnum = 0; right[rnum] != NULL; ++rnum)
        {
          temp = comsubs(left[lnum], right[rnum]);
          if (temp == NULL)
            {
              /* freelist() releases only the strings; the array has to be
                 freed separately.  */
              freelist(both);
              free(both);
              return NULL;
            }
          both = addlists(both, temp);
          /* addlists() copies what it keeps, so the temporary list and
             its array are finished with here.  */
          freelist(temp);
          free(temp);
          if (both == NULL)
            return NULL;
        }
    }
  return both;
}

+/*

+typedef struct {

+ char ** in;

+ char * left;

+ char * right;

+ char * is;

+} must;

+ */

+/*
+** resetmust: forget everything recorded in one `must` entry.
+**
+** The `is`, `right` and `left` strings are truncated to empty and the
+** `in` list is passed to freelist().  The buffers themselves stay
+** allocated; callers keep using mp->in after this call.
+*/
+static void
+resetmust(mp)
+register must * mp;
+{
+	mp->is[0] = '\0';
+	mp->right[0] = '\0';
+	mp->left[0] = '\0';
+	freelist(mp->in);
+}

+static void /*
+** regmust: derive the "must" string for the compiled pattern in `reg`.
+**
+** Walks reg->tokens[0 .. reg->tindex-1] in order while maintaining a
+** stack of `must` records in `musts` (mp points at the next free slot).
+** Leaf tokens fill the slot at mp; operator tokens pop their operand(s)
+** with --mp and rebuild the result in the lowest popped slot; the ++mp
+** at the bottom of the loop then pushes that slot back.
+**
+** Judging by the comments and code below, each record holds:
+**   is    - the exact string the subexpression matches, or "" if unknown
+**   left  - a string every match begins with, or ""
+**   right - a string every match ends with, or ""
+**   in    - NULL-terminated list of strings every match contains
+**
+** On _END the longest entry of musts[0].in is chosen; it is copied into
+** reg->must (at most MUST_MAX - 1 characters) and its length stored in
+** reg->mustn.  On any failure or unexpected token the result stays "".
+**
+** NOTE(review): the parameter is named `r` but the body only uses
+** `reg`, which is not declared in this function -- presumably a
+** file-scope variable; confirm against the rest of the file.
+*/

+regmust(r)

+register struct regexp * r;

+ register must * musts;

+ register must * mp;

+ register char * result = ""; /* what gets reported if nothing better is found */

+ register int ri;

+ register int i;

+ register _token t;

+ static must must0; /* static storage, hence all-zero: template for blanking slots */

+ reg->mustn = 0;

+ reg->must[0] = '\0';

+ musts = (must *) malloc((reg->tindex + 1) * sizeof *musts); /* one slot per token, plus one */

+ if (musts == NULL)

+ return; /* reg->must / reg->mustn keep the empty values set above */

+ mp = musts;

+ for (i = 0; i <= reg->tindex; ++i)

+ mp[i] = must0; /* null every pointer first so the cleanup at done: sees NULL in slots never allocated */

+ for (i = 0; i <= reg->tindex; ++i) {

+ mp[i].in = (char **) malloc(sizeof *mp[i].in);

+ mp[i].left = malloc(2); /* room for one character plus terminator (see the plain-character case) */

+ mp[i].right = malloc(2);

+ mp[i].is = malloc(2);

+ if (mp[i].in == NULL || mp[i].left == NULL ||

+ mp[i].right == NULL || mp[i].is == NULL)

+ goto done;

+ mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';

+ mp[i].in[0] = NULL;

+ }

+ for (ri = 0; ri < reg->tindex; ++ri) {

+ switch (t = reg->tokens[ri]) {

+ case _ALLBEGLINE:

+ case _ALLENDLINE:

+ case _LPAREN:

+ case _RPAREN:

+ goto done; /* "cannot happen" */

+ case _EMPTY:

+ case _BEGLINE:

+ case _ENDLINE:

+ case _BEGWORD:

+ case _ENDWORD:

+ case _LIMWORD:

+ case _NOTLIMWORD:

+ case _BACKREF:

+ resetmust(mp); /* leaf token: pushed as an empty record */

+ break;

+ case _STAR:

+ case _QMARK:

+ if (mp <= musts)

+ goto done; /* "cannot happen" */

+ --mp; /* pop the single operand ... */

+ resetmust(mp); /* ... and discard everything recorded for it */

+ break;

+ case _OR:

+ if (mp < &musts[2])

+ goto done; /* "cannot happen" */

+ {

+ register char ** new;

+ register must * lmp;

+ register must * rmp;

+ register int j, ln, rn, n;

+ rmp = --mp; /* right operand is on top */

+ lmp = --mp; /* left operand below it; the result is built here */

+ /* Guaranteed to be. Unlikely, but. . . */

+ if (strcmp(lmp->is, rmp->is) != 0)

+ lmp->is[0] = '\0'; /* the alternatives differ: no exact string survives */

+ /* Left side--easy */

+ i = 0;

+ while (lmp->left[i] != '\0' &&

+ lmp->left[i] == rmp->left[i])

+ ++i;

+ lmp->left[i] = '\0'; /* keep only the prefix common to both */

+ /* Right side */

+ ln = strlen(lmp->right);

+ rn = strlen(rmp->right);

+ n = ln;

+ if (n > rn)

+ n = rn; /* n = shorter of the two lengths */

+ for (i = 0; i < n; ++i) /* compare backwards from the ends; i ends up as the common-suffix length */

+ if (lmp->right[ln - i - 1] !=

+ rmp->right[rn - i - 1])

+ break;

+ for (j = 0; j < i; ++j) /* slide that suffix to the front of lmp->right */

+ lmp->right[j] =

+ lmp->right[(ln - i) + j];

+ lmp->right[j] = '\0';

+ new = inboth(lmp->in, rmp->in);

+ if (new == NULL)

+ goto done;

+ freelist(lmp->in); /* release the old list's strings ... */

+ free((char *) lmp->in); /* ... and the old array, before installing the new list */

+ lmp->in = new;

+ }

+ break;

+ case _PLUS:

+ if (mp <= musts)

+ goto done; /* "cannot happen" */

+ --mp;

+ mp->is[0] = '\0'; /* only `is` is dropped; left, right and in are kept */

+ break;

+ case _END:

+ if (mp != &musts[1]) /* exactly one record must remain on the stack */

+ goto done; /* "cannot happen" */

+ for (i = 0; musts[0].in[i] != NULL; ++i)

+ if (strlen(musts[0].in[i]) > strlen(result)) /* strict >: on ties the earlier entry wins */

+ result = musts[0].in[i]; /* points into the list; copied out at done: before it is freed */

+ goto done;

+ case _CAT:

+ if (mp < &musts[2])

+ goto done; /* "cannot happen" */

+ {

+ register must * lmp;

+ register must * rmp;

+ rmp = --mp; /* right operand is on top */

+ lmp = --mp; /* left operand below it; the result is built here */

+ /*

+ ** In. Everything in left, plus everything in

+ ** right, plus catenation of

+ ** left's right and right's left.

+ */

+ lmp->in = addlists(lmp->in, rmp->in);

+ if (lmp->in == NULL)

+ goto done;

+ if (lmp->right[0] != '\0' &&

+ rmp->left[0] != '\0') {

+ register char * tp;

+ tp = icpyalloc(lmp->right); /* helper not shown here; presumably returns a fresh copy -- confirm */

+ if (tp == NULL)

+ goto done;

+ tp = icatalloc(tp, rmp->left); /* helper not shown here; presumably appends and may reallocate -- confirm */

+ if (tp == NULL)

+ goto done;

+ lmp->in = enlist(lmp->in, tp,

+ strlen(tp));

+ free(tp); /* freed right away, so enlist is presumably expected to keep its own copy -- confirm */

+ if (lmp->in == NULL)

+ goto done;

+ }

+ /* Left-hand */

+ if (lmp->is[0] != '\0') { /* rmp->left is appended to the prefix only when the left operand has a non-empty `is` */

+ lmp->left = icatalloc(lmp->left,

+ rmp->left);

+ if (lmp->left == NULL)

+ goto done;

+ }

+ /* Right-hand */

+ if (rmp->is[0] == '\0')

+ lmp->right[0] = '\0'; /* right operand has no `is`: drop lmp->right before appending rmp->right */

+ lmp->right = icatalloc(lmp->right, rmp->right);

+ if (lmp->right == NULL)

+ goto done;

+ /* Guaranteed to be */

+ if (lmp->is[0] != '\0' && rmp->is[0] != '\0') {

+ lmp->is = icatalloc(lmp->is, rmp->is);

+ if (lmp->is == NULL)

+ goto done;

+ }

+ break;

+ default:

+ if (t < _END) {

+ /* "cannot happen" */

+ goto done;

+ } else if (t == '\0') {

+ /* not on *my* shift */

+ goto done;

+ } else if (t >= _SET) {

+ /* easy enough */

+ resetmust(mp);

+ } else {

+ /* plain character */

+ resetmust(mp);

+ mp->is[0] = mp->left[0] = mp->right[0] = t; /* a single character is its own exact string, prefix and suffix */

+ mp->is[1] = mp->left[1] = mp->right[1] = '\0';

+ mp->in = enlist(mp->in, mp->is, 1);

+ if (mp->in == NULL)

+ goto done;

+ }

+ break;

+ }

+ ++mp; /* push: the slot just written (or rebuilt) becomes the top of the stack */

+ }

+done:

+ (void) strncpy(reg->must, result, MUST_MAX - 1);

+ reg->must[MUST_MAX - 1] = '\0'; /* strncpy does not terminate on truncation, so do it explicitly */

+ reg->mustn = strlen(reg->must);

+ mp = musts;

+ for (i = 0; i <= reg->tindex; ++i) { /* release every slot, whether or not the walk reached it */

+ freelist(mp[i].in);

+ ifree((char *) mp[i].in);

+ ifree(mp[i].left);

+ ifree(mp[i].right);

+ ifree(mp[i].is);

+ }

+ free((char *) mp); /* mp == musts here: frees the record array itself */

diff --git a/gnu/usr.bin/awk/dfa.h b/gnu/usr.bin/awk/dfa.h
new file mode 100644
index 000000000000..65fc49565a7c
--- /dev/null
+++ b/gnu/usr.bin/awk/dfa.h

@@ -0,0 +1,543 @@

+/* dfa.h - declarations for GNU deterministic regexp compiler