author	Rodney W. Grimes <rgrimes@FreeBSD.org>	1994-05-30 19:09:18 +0000
committer	Rodney W. Grimes <rgrimes@FreeBSD.org>	1994-05-30 19:09:18 +0000
commit	afe61c15161c324a7af299a9b8457aba5afc92db (patch)
tree	2ced81c11d3481fb1a4d3d832089bc744304b24e /share/doc/papers
parent	9f23196c427eddb59bd454053a732e7cfebcb459 (diff)
BSD 4.4 Lite Share Sources
Notes: svn path=/head/; revision=1638
Diffstat (limited to 'share/doc/papers')
-rw-r--r--  share/doc/papers/beyond4.3/Makefile  7
-rw-r--r--  share/doc/papers/beyond4.3/beyond43.ms  518
-rw-r--r--  share/doc/papers/diskperf/Makefile  11
-rw-r--r--  share/doc/papers/diskperf/abs.ms  176
-rw-r--r--  share/doc/papers/diskperf/appendix.ms  98
-rw-r--r--  share/doc/papers/diskperf/conclusions.ms  127
-rw-r--r--  share/doc/papers/diskperf/equip.ms  177
-rw-r--r--  share/doc/papers/diskperf/methodology.ms  111
-rw-r--r--  share/doc/papers/diskperf/motivation.ms  93
-rw-r--r--  share/doc/papers/diskperf/results.ms  337
-rw-r--r--  share/doc/papers/diskperf/tests.ms  108
-rw-r--r--  share/doc/papers/fsinterface/Makefile  7
-rw-r--r--  share/doc/papers/fsinterface/abstract.ms  73
-rw-r--r--  share/doc/papers/fsinterface/fsinterface.ms  1176
-rw-r--r--  share/doc/papers/fsinterface/slides.t  318
-rw-r--r--  share/doc/papers/kernmalloc/Makefile  11
-rw-r--r--  share/doc/papers/kernmalloc/alloc.fig  115
-rw-r--r--  share/doc/papers/kernmalloc/appendix.t  137
-rw-r--r--  share/doc/papers/kernmalloc/kernmalloc.t  649
-rw-r--r--  share/doc/papers/kernmalloc/spell.ok  57
-rw-r--r--  share/doc/papers/kernmalloc/usage.tbl  75
-rw-r--r--  share/doc/papers/kerntune/0.t  129
-rw-r--r--  share/doc/papers/kerntune/1.t  48
-rw-r--r--  share/doc/papers/kerntune/2.t  234
-rw-r--r--  share/doc/papers/kerntune/3.t  290
-rw-r--r--  share/doc/papers/kerntune/4.t  99
-rw-r--r--  share/doc/papers/kerntune/Makefile  10
-rw-r--r--  share/doc/papers/kerntune/fig2.pic  57
-rw-r--r--  share/doc/papers/memfs/0.t  86
-rw-r--r--  share/doc/papers/memfs/1.t  392
-rw-r--r--  share/doc/papers/memfs/A.t  173
-rw-r--r--  share/doc/papers/memfs/Makefile  22
-rw-r--r--  share/doc/papers/memfs/ref.bib  49
-rw-r--r--  share/doc/papers/memfs/spell.ok  18
-rw-r--r--  share/doc/papers/memfs/tmac.srefs  177
-rw-r--r--  share/doc/papers/newvm/0.t  86
-rw-r--r--  share/doc/papers/newvm/1.t  377
-rw-r--r--  share/doc/papers/newvm/Makefile  10
-rw-r--r--  share/doc/papers/newvm/a.t  239
-rw-r--r--  share/doc/papers/newvm/spell.ok  56
-rw-r--r--  share/doc/papers/nqnfs/Makefile  10
-rw-r--r--  share/doc/papers/nqnfs/nqnfs.me  2007
-rw-r--r--  share/doc/papers/px/Makefile  15
-rw-r--r--  share/doc/papers/px/fig1.1.n  71
-rw-r--r--  share/doc/papers/px/fig1.2.n  68
-rw-r--r--  share/doc/papers/px/fig1.3.n  60
-rw-r--r--  share/doc/papers/px/fig2.3.raw  103
-rw-r--r--  share/doc/papers/px/fig2.4.n  57
-rw-r--r--  share/doc/papers/px/fig3.2.n  56
-rw-r--r--  share/doc/papers/px/fig3.3.n  57
-rw-r--r--  share/doc/papers/px/pxin0.n  140
-rw-r--r--  share/doc/papers/px/pxin1.n  538
-rw-r--r--  share/doc/papers/px/pxin2.n  923
-rw-r--r--  share/doc/papers/px/pxin3.n  597
-rw-r--r--  share/doc/papers/px/pxin4.n  67
-rw-r--r--  share/doc/papers/px/table2.1.n  83
-rw-r--r--  share/doc/papers/px/table2.2.n  85
-rw-r--r--  share/doc/papers/px/table2.3.n  45
-rw-r--r--  share/doc/papers/px/table3.1.n  47
-rw-r--r--  share/doc/papers/px/tmac.p  113
-rw-r--r--  share/doc/papers/relengr/0.t  91
-rw-r--r--  share/doc/papers/relengr/1.t  69
-rw-r--r--  share/doc/papers/relengr/2.t  146
-rw-r--r--  share/doc/papers/relengr/3.t  390
-rw-r--r--  share/doc/papers/relengr/Makefile  12
-rw-r--r--  share/doc/papers/relengr/ref.bib  26
-rw-r--r--  share/doc/papers/relengr/ref.bib.ig  3
-rw-r--r--  share/doc/papers/relengr/spell.ok  15
-rw-r--r--  share/doc/papers/relengr/tmac.srefs  179
-rw-r--r--  share/doc/papers/sysperf/0.t  247
-rw-r--r--  share/doc/papers/sysperf/1.t  81
-rw-r--r--  share/doc/papers/sysperf/2.t  258
-rw-r--r--  share/doc/papers/sysperf/3.t  694
-rw-r--r--  share/doc/papers/sysperf/4.t  774
-rw-r--r--  share/doc/papers/sysperf/5.t  285
-rw-r--r--  share/doc/papers/sysperf/6.t  70
-rw-r--r--  share/doc/papers/sysperf/7.t  164
-rw-r--r--  share/doc/papers/sysperf/Makefile  22
-rw-r--r--  share/doc/papers/sysperf/a1.t  668
-rw-r--r--  share/doc/papers/sysperf/a2.t  117
80 files changed, 16386 insertions, 0 deletions
diff --git a/share/doc/papers/beyond4.3/Makefile b/share/doc/papers/beyond4.3/Makefile
new file mode 100644
index 000000000000..f474dba9488d
--- /dev/null
+++ b/share/doc/papers/beyond4.3/Makefile
@@ -0,0 +1,7 @@
+# @(#)Makefile 5.2 (Berkeley) 6/8/93
+
+DIR= papers/beyond43
+SRCS= beyond43.ms
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/beyond4.3/beyond43.ms b/share/doc/papers/beyond4.3/beyond43.ms
new file mode 100644
index 000000000000..f83e68adfa9b
--- /dev/null
+++ b/share/doc/papers/beyond4.3/beyond43.ms
@@ -0,0 +1,518 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)beyond43.ms 5.1 (Berkeley) 6/5/90
+.\"
+.\" *troff -ms
+.rm CM
+.sp 2
+.ce 100
+\fB\s+2Current Research by
+The Computer Systems Research Group
+of Berkeley\s-2\fP
+.ds DT "February 10, 1989
+.\" \fBDRAFT of \*(DT\fP
+.sp 2
+.nf
+Marshall Kirk McKusick
+Michael J Karels
+Keith Sklower
+Kevin Fall
+Marc Teitelbaum
+Keith Bostic
+.fi
+.sp 2
+.ce 1
+\fISummary\fP
+.ce 0
+.PP
+The release of 4.3BSD in April of 1986 addressed many of the
+performance problems and unfinished interfaces
+present in 4.2BSD [Leffler84] [McKusick85].
+The Computer Systems Research Group at Berkeley
+has now embarked on a new development phase to
+update other major components of the system, as well as to offer
+new functionality.
+There are five major ongoing projects.
+The first is to develop an OSI network protocol suite and to integrate
+existing ISO applications into Berkeley UNIX.
+The second is to develop and support an interface compliant with the
+P1003.1 POSIX standard recently approved by the IEEE.
+The third is to refine the TCP/IP networking to improve
+its performance and limit congestion on slow and/or lossy networks.
+The fourth is to provide a standard interface to file systems
+so that multiple local and remote file systems can be supported,
+much as multiple networking protocols are supported by 4.3BSD.
+The fifth is to evaluate alternate access control mechanisms and
+audit the existing security features of the system, particularly
+with respect to network services.
+Other areas of work include multi-architecture support,
+a general purpose kernel memory allocator, disk labels, and
+extensions to the 4.2BSD fast filesystem.
+.PP
+We are planning to finish implementation prototypes for each of the
+five main areas of work over the next year, and provide an informal
+test release sometime next year for interested developers.
+After feedback and refinements from the testers have been incorporated,
+these changes will appear in the next full Berkeley release, which is typically
+made about a year after the test release.
+.br
+.ne 10
+.sp 2
+.NH
+Recently Completed Projects
+.PP
+There have been several changes in the system that were included
+in the recent 4.3BSD Tahoe release.
+.NH 2
+Multi-architecture support
+.PP
+Support has been added for the DEC VAX 8600/8650, VAX 8200/8250,
+MicroVAXII and MicroVAXIII.
+.PP
+The largest change has been the incorporation of support for the first
+non-VAX processor, the CCI Power 6/32 and 6/32SX. (This addition also
+supports the
+Harris HCX-7 and HCX-9, as well as the Sperry 7000/40 and ICL machines.)
+The Power 6 version of 4.3BSD is largely based on the compilers and
+device drivers done for CCI's 4.2BSD UNIX,
+and is otherwise similar to the VAX release of 4.3BSD.
+The entire source tree, including all kernel and user-level sources,
+has been merged using a structure that will easily accommodate the addition
+of other processor families. A MIPS R2000 has been donated to us,
+making the MIPS architecture a likely candidate for inclusion into a future
+BSD release.
+.NH 2
+Kernel Memory Allocator
+.PP
+The 4.3BSD UNIX kernel used 10 different memory allocation mechanisms,
+each designed for the particular needs of the subsystem using it.
+These mechanisms have been replaced by a general purpose dynamic
+memory allocator that can be used by all of the kernel subsystems.
+The design of this allocator takes advantage of known memory usage
+patterns in the UNIX kernel and a hybrid strategy that is time-efficient
+for small allocations and space-efficient for large allocations.
+This allocator replaces the multiple memory allocation interfaces
+with a single easy-to-program interface,
+results in more efficient use of global memory by eliminating
+partitioned and specialized memory pools,
+and is quick enough (approximately 15 VAX instructions) that no
+performance loss is observed relative to the current implementations
+[McKusick88].
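+.PP
+To make the hybrid strategy concrete, the following fragment is a
+minimal sketch, not the allocator itself; the page size \fCNBPG\fP,
+the bucket sizes, and the page-level helper \fCpagealloc()\fP are
+illustrative assumptions.
+.DS
+#define NBPG	1024		/* illustrative page size */
+char	*freelist[12];		/* one free list per power of 2 */
+char	*pagealloc();		/* hypothetical page-level allocator */
+
+char *
+kmemalloc(size)
+	int size;
+{
+	register int bucket, piece;
+	register char *p, *cp;
+
+	if (size > NBPG / 2)		/* large: space-efficient path */
+		return (pagealloc(size));
+	/* small: time-efficient path; smallest bucket is 16 bytes
+	   so that a free piece can hold its link pointer */
+	for (bucket = 4; (1 << bucket) < size; bucket++)
+		;
+	if ((p = freelist[bucket]) == 0) {
+		p = pagealloc(NBPG);	/* carve a fresh page ... */
+		piece = 1 << bucket;
+		for (cp = p; cp + piece < p + NBPG; cp += piece)
+			*(char **)cp = cp + piece;	/* ... into a chain */
+		*(char **)cp = 0;
+	}
+	freelist[bucket] = *(char **)p;	/* pop the first free piece */
+	return (p);
+}
+.DE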
+.NH 2
+Disk Labels
+.PP
+During the work on the CCI machine,
+it became obvious that disk geometry and filesystem layout information
+must be stored on each disk in a pack label.
+Disk labels were implemented for the CCI disks and for the most common
+types of disk controllers on the VAX.
+A utility was written to create and maintain the disk information,
+and other user-level programs that use such information now obtain
+it from the disk label.
+The use of this facility has allowed improvements in the file system's
+knowledge of irregular disk geometries such as track-to-track skew.
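+.PP
+For example, a user-level program can obtain the geometry through the
+disk label \fIioctl\fP; a minimal sketch (the device name and the
+fields printed here are illustrative):
+.DS
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+
+main(argc, argv)
+char **argv;
+{
+	struct disklabel label;
+	int fd;
+
+	fd = open(argv[1], 0);		/* raw disk, e.g. /dev/rra0c */
+	if (fd < 0 || ioctl(fd, DIOCGDINFO, (char *)&label) < 0) {
+		perror(argv[1]);
+		exit(1);
+	}
+	printf("%d sectors/track, %d tracks/cylinder, skew %d\en",
+	    label.d_nsectors, label.d_ntracks, label.d_trackskew);
+	exit(0);
+}
+.DE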
+.NH 2
+Fat Fast File System
+.PP
+The 4.2 fast file system [McKusick84]
+contained several statically sized structures,
+imposing limits on the number of cylinders per cylinder group,
+inodes per cylinder group,
+and number of distinguished rotational positions.
+The new ``fat'' filesystem allows these limits to be set at filesystem
+creation time.
+Old kernels will treat the new filesystems as read-only,
+and new kernels
+will accommodate both formats.
+The filesystem check facility, \fCfsck\fP, has also been modified to check
+either type.
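+.PP
+Concretely, the limits that were compiled into the old filesystem are
+now per-filesystem parameters recorded in the superblock and fixed by
+\fInewfs\fP at creation time; an abridged, illustrative fragment of
+the relevant fields:
+.DS
+struct fs {
+	/* ... */
+	int	fs_cpg;		/* cylinders per cylinder group */
+	int	fs_ipg;		/* inodes per cylinder group */
+	int	fs_nrpos;	/* number of rotational positions */
+	/* ... */
+};
+.DE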
+.br
+.ne 10
+.sp 2
+.NH
+Current UNIX Research at Berkeley
+.PP
+Since the release of 4.3BSD in mid 1986,
+we have begun work on several new major areas of research.
+Our goal is to apply leading edge research ideas into a stable
+and reliable implementation that solves current problems in
+operating systems development.
+.NH 2
+OSI network protocol development
+.PP
+The network architecture of 4.2BSD was designed to accommodate
+multiple network protocol families and address formats,
+and an implementation of the ISO OSI network protocols
+should enter into this framework without much difficulty.
+We plan to
+implement the OSI connectionless internet protocol (CLNP),
+and device drivers for X.25, 802.3, and possibly 802.5 interfaces, and
+to integrate these with an OSI transport class 4 (TP-4) implementation.
+We will also incorporate into the Berkeley Software Distribution an
+updated ISO Development Environment (ISODE)
+featuring International Standard (IS) versions of utilities.
+ISODE implements the session and presentation layers of the OSI protocol suite,
+and will include an implementation of the file transfer protocol (FTAM).
+It is also possible that an X.400 implementation now being done at
+University College, London and the University of Nottingham
+will be available for testing and distribution.
+.LP
+This implementation comprises four areas.
+.IP 1)
+We are updating the University of
+Wisconsin TP-4 to match GOSIP requirements.
+The University of Wisconsin developed a transport class 4
+implementation for the 4.2BSD kernel under contract to Mitre.
+This implementation must be updated to reflect the National Institute
+of Standards and Technology (NIST, formerly NBS) workshop agreements,
+GOSIP, and 4.3BSD requirements.
+We will make this TP-4 operate with an OSI IP,
+as the original implementation was built to run over the DoD IP.
+.IP 2)
+A kernel version of the OSI IP and ES-IS protocols must be produced.
+We will implement the kernel version of these protocols.
+.IP 3)
+The required device drivers need to be integrated into a BSD kernel.
+4.3BSD has existing device drivers for many ethernet devices; future
+BSD versions may also support X.25 devices as well as token ring
+networks.
+These device drivers must be integrated
+into the kernel OSI protocol implementations.
+.IP 4)
+The existing OSINET interoperability test network is available so
+that the interoperability of the ISODE and BSD kernel protocols
+can be established through tests with several vendors.
+Testing is crucial because an openly available version of GOSIP protocols
+that does not interoperate with DEC, IBM, SUN, ICL, HIS, and other
+major vendors would be embarrassing.
+To allow testing of the integrated pieces, the most desirable
+approach is to provide access to OSINET at UCB.
+A second approach is to do the interoperability testing at
+the site of an existing OSINET member, such as the NBS.
+.NH 2
+Compliance with POSIX 1003
+.PP
+Berkeley became involved several months ago in the development
+of the IEEE POSIX P1003.1 system interface standard.
+Since then, we have been participating in the working groups
+of P1003.2 (shell and application utility interface),
+P1003.6 (security), P1003.7 (system administration), and P1003.8
+(networking).
+.PP
+The IEEE published the POSIX P1003.1 standard in late 1988.
+POSIX related changes to the BSD system have included a new terminal
+driver, support for POSIX sessions and job control, expanded signal
+functionality, restructured directory access routines, and new set-user
+and set-group id facilities.
+We currently have a prototype implementation of the
+POSIX driver with extensions to provide binary compatibility with
+applications developed for the old Berkeley terminal driver.
+We also have a prototype implementation of the 4.2BSD-based POSIX
+job control facility.
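+.PP
+The flavor of the new session and job control interfaces can be
+sketched as follows (a simplified fragment of a job-control shell;
+\fCcmd\fP and \fCargs\fP are placeholders):
+.DS
+pid = fork();
+if (pid == 0) {			/* child: the new job */
+	setpgid(0, 0);		/* put it in its own process group */
+	execvp(cmd, args);
+	_exit(1);
+}
+setpgid(pid, pid);		/* parent repeats this, avoiding a race */
+tcsetpgrp(0, pid);		/* give the job the terminal */
+.DE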
+.PP
+The P1003.2 draft is currently being voted on by the IEEE
+P1003.2 balloting group.
+Berkeley is particularly interested in the results of this standard,
+as it will profoundly influence the user environment.
+The other groups are in comparatively early phases, with drafts
+coming to ballot sometime in the 90's.
+Berkeley will continue to participate in these groups, and
+move in the near future toward a P1003.1 and P1003.2 compliant
+system.
+We have many of the utilities outlined in the current P1003.2 draft
+already implemented, and have other parties willing to contribute
+additional implementations.
+.NH 2
+Improvements to the TCP/IP Networking Protocols
+.PP
+The Internet and the Berkeley collection of local-area networks
+have both grown at high rates in the last year.
+The Bay Area Regional Research Network (BARRNet),
+connecting several UC campuses, Stanford and NASA-Ames
+has recently become operational, increasing the complexity
+of the network connectivity.
+Both Internet and local routing algorithms are showing the strain
+of continued growth.
+We have made several changes in the local routing algorithm
+to keep accommodating the current topology,
+and are participating in the development of new routing algorithms
+and standard protocols.
+.PP
+Recent work in collaboration with Van Jacobson of the Lawrence Berkeley
+Laboratory has led to the design and implementation of several new algorithms
+for TCP that improve throughput on both local and long-haul networks
+while reducing unnecessary retransmission.
+The improvement is especially striking when connections must traverse
+slow and/or lossy networks.
+The new algorithms include ``slow-start,''
+a technique for opening the TCP flow control window slowly
+and using the returning stream of acknowledgements as a clock
+to drive the connection at the highest speed tolerated by the intervening
+network.
+A modification of this technique allows the sender to dynamically modify
+the send window size to adjust to changing network conditions.
+In addition, the round-trip timer has been modified to estimate the variance
+in round-trip time, thus allowing earlier retransmission of lost packets
+with less spurious retransmission due to increasing network delay.
+Along with a scheme proposed by Phil Karn of Bellcore,
+these changes reduce unnecessary retransmission over difficult paths
+such as Satnet by nearly two orders of magnitude
+while improving throughput dramatically.
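+.PP
+The essence of the new timer algorithm can be shown in a few lines;
+this is a sketch in the integer fixed-point style of Jacobson's
+published description, not the 4.3BSD code itself (\fCsrtt\fP is kept
+scaled by 8, \fCrttvar\fP by 4):
+.DS
+int	srtt, rttvar, rto;	/* scaled state, per connection */
+
+/* update the estimators with a new round-trip measurement m */
+rttupdate(m)
+	int m;
+{
+	m -= srtt >> 3;
+	srtt += m;		/* srtt = 7/8 srtt + 1/8 m */
+	if (m < 0)
+		m = -m;
+	m -= rttvar >> 2;
+	rttvar += m;		/* rttvar = 3/4 rttvar + 1/4 |err| */
+	rto = (srtt >> 3) + rttvar;	/* srtt + 4 * mean deviation */
+}
+.DE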
+.PP
+The current TCP implementation is now being readied
+for more widespread distribution via the network and as a
+standard Berkeley distribution unencumbered by any commercial licensing.
+We are continuing to refine the TCP and IP implementations
+using the ARPANET, BARRNet, the NSF network
+and local campus nets as testbeds.
+In addition, we are incorporating applicable algorithms from this work
+into the TP-4 protocol implementation.
+.NH 2
+Toward a Compatible File System Interface
+.PP
+The most critical shortcoming of the 4.3BSD UNIX system was in the
+area of distributed file systems.
+As with networking protocols,
+there is no single distributed file system
+that provides sufficient speed and functionality for all problems.
+It is frequently necessary to support several different remote
+file system protocols, just as it is necessary to run several
+different network protocols.
+.PP
+As network or remote file systems have been implemented for UNIX,
+several stylized interfaces between the file system implementation
+and the rest of the kernel have been developed.
+Among these are Sun Microsystems' Virtual File System interface (VFS)
+using \fBvnodes\fP [Sandberg85] [Kleiman86],
+Digital Equipment's Generic File System (GFS) architecture [Rodriguez86],
+AT&T's File System Switch (FSS) [Rifkin86],
+the LOCUS distributed file system [Walker85],
+and Masscomp's extended file system [Cole85].
+Other remote file systems have been implemented in research or
+university groups for internal use,
+notably the network file system in the Eighth Edition UNIX
+system [Weinberger84] and two different file systems used at Carnegie Mellon
+University [Satyanarayanan85].
+Numerous other remote file access methods have been devised for use
+within individual UNIX processes,
+many of them by modifications to the C I/O library
+similar to those in the Newcastle Connection [Brownbridge82].
+.PP
+Each design attempts to isolate file system-dependent details
+below a generic interface and to provide a framework within which
+new file systems may be incorporated.
+However, each of these interfaces is different from
+and incompatible with the others.
+Each addresses somewhat different design goals,
+having been based on a different version of UNIX,
+having targeted a different set of file systems with varying characteristics,
+and having selected a different set of file system primitive operations.
+.PP
+Our effort in this area is aimed at providing a common framework to
+support these different distributed file systems simultaneously rather than to
+simply implement yet another protocol.
+This requires a detailed study of the existing protocols,
+and discussion with their implementors to determine whether
+they could modify their implementation to fit within our proposed
+framework. We have studied the various file system interfaces to determine
+their generality, completeness, robustness, efficiency, and aesthetics
+and are currently working on a file system interface
+that we believe includes the best features of
+each of the existing implementations.
+This work and the rationale underlying its development
+have been presented to major software vendors as an early step
+toward convergence on a standard compatible file system interface.
+Briefly, the proposal adopts the 4.3BSD calling convention for file
+name lookup but otherwise is closely related to Sun's VFS
+and DEC's GFS [Karels86].
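+.PP
+Like the interfaces it draws upon, the proposed framework is built
+around an operations vector: each file system supplies a structure of
+entry points, and the rest of the kernel calls through that structure
+without knowing which implementation lies below. A much-simplified
+sketch with illustrative names, not the proposed interface itself:
+.DS
+struct fileops {
+	int	(*fo_lookup)();	/* pathname component -> handle */
+	int	(*fo_read)();
+	int	(*fo_write)();
+	int	(*fo_getattr)();
+};
+
+struct fhandle {
+	struct	fileops *fh_ops;	/* per-filesystem entry points */
+	char	*fh_data;		/* private per-filesystem state */
+};
+
+#define FH_READ(fh, uio)	(*(fh)->fh_ops->fo_read)(fh, uio)
+.DE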
+.NH 2
+System Security
+.PP
+The recent invasion of the DARPA Internet by a quickly reproducing ``worm''
+highlighted the need for a thorough review of the access
+safeguards built into the system.
+Until now, we have taken a passive approach to dealing with
+weaknesses in the system access mechanisms, rather than actively
+searching for possible weaknesses.
+When we are notified of a problem or loophole in a system utility
+by one of our users,
+we have a well defined procedure for fixing the problem and
+expeditiously disseminating the fix to the BSD mailing list.
+This procedure has proven itself to be effective in
+solving known problems as they arise
+(witness its success in handling the recent worm).
+However, we feel that it would be useful to take a more active
+role in identifying problems before they are reported (or exploited).
+We will make a complete audit of the system
+utilities and network servers to find unintended system access mechanisms.
+.PP
+As a part of the work to make the system more resistant to attack
+from local users or via the network, it will be necessary to produce
+additional documentation on the configuration and operation of the system.
+This documentation will cover such topics as file and directory ownership
+and access, network and server configuration,
+and control of privileged operations such as file system backups.
+.PP
+We are investigating the addition of access control lists (ACLs) for
+filesystem objects.
+ACLs provide a much finer granularity of control over file access permissions
+than the current
+discretionary access control mechanism (mode bits).
+Furthermore, they are necessary
+in environments where C2 level security or better, as defined in the DoD
+TCSEC [DoD83], is required.
+The POSIX P1003.6 security group has made notable progress in determining
+how an ACL mechanism should work, and several vendors have implemented
+ACLs for their commercial systems.
+Berkeley will investigate the existing implementations and determine
+how to best integrate ACLs with the existing mechanism.
+.PP
+A major shortcoming of the present system is that authentication
+over the network is based solely on the privileged port mechanism
+between trusting hosts and users.
+Although privileged ports can only be created by processes running as root
+on a UNIX system,
+such processes are easy for a workstation user to obtain;
+they simply reboot their workstation in single user mode.
+Thus, a better authentication mechanism is needed.
+At present, we believe that the MIT Kerberos authentication
+server [Steiner88] provides the best solution to this problem.
+We propose to investigate Kerberos further as well as other
+authentication mechanisms and then to integrate
+the best one into Berkeley UNIX.
+Part of this integration would be the addition of the
+authentication mechanism into utilities such as
+telnet, login, remote shell, etc.
+We will add support for telnet (eventually replacing rlogin),
+the X window system, and the mail system within an authentication
+domain (a Kerberos \fIrealm\fP).
+We hope to replace the existing password authentication on each host
+with the network authentication system.
+.NH
+References
+.sp
+.IP Brownbridge82
+Brownbridge, D.R., L.F. Marshall, B. Randell,
+``The Newcastle Connection, or UNIXes of the World Unite!,''
+\fISoftware\- Practice and Experience\fP, Vol. 12, pp. 1147-1162, 1982.
+.sp
+.IP Cole85
+.br
+Cole, C.T., P.B. Flinn, A.B. Atlas,
+``An Implementation of an Extended File System for UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 131-150, June, 1985.
+.sp
+.IP DoD83
+.br
+Department of Defense,
+``Trusted Computer System Evaluation Criteria,''
+\fICSC-STD-001-83\fP,
+DoD Computer Security Center, August, 1983.
+.sp
+.IP Karels86
+Karels, M., M. McKusick,
+``Towards a Compatible File System Interface,''
+\fIProceedings of the European UNIX Users Group Meeting\fP,
+Manchester, England, pp. 481-496, September 1986.
+.sp
+.IP Kleiman86
+Kleiman, S.,
+``Vnodes: An Architecture for Multiple File System Types in Sun UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 238-247, June, 1986.
+.sp
+.IP Leffler84
+Leffler, S., M.K. McKusick, M. Karels,
+``Measuring and Improving the Performance of 4.2BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 237-252, June, 1984.
+.sp
+.IP McKusick84
+McKusick, M.K., W. Joy, S. Leffler, R. Fabry,
+``A Fast File System for UNIX'',
+\fIACM Transactions on Computer Systems 2\fP, 3,
+pp. 181-197, August 1984.
+.sp
+.IP McKusick85
+McKusick, M.K., M. Karels, S. Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 519-531, June, 1985.
+.sp
+.IP McKusick86
+McKusick, M.K., M. Karels,
+``A New Virtual Memory Implementation for Berkeley UNIX,''
+\fIProceedings of the European UNIX Users Group Meeting\fP,
+Manchester, England, pp. 451-460, September 1986.
+.sp
+.IP McKusick88
+McKusick, M.K., M. Karels,
+``Design of a General Purpose Memory Allocator for the 4.3BSD UNIX Kernel,''
+\fIUsenix Conference Proceedings\fP,
+pp. 295-303, June, 1988.
+.sp
+.IP Rifkin86
+Rifkin, A.P., M.P. Forbes, R.L. Hamilton, M. Sabrio, S. Shah, K. Yueh,
+``RFS Architectural Overview,'' \fIUsenix Conference Proceedings\fP,
+pp. 248-259, June, 1986.
+.sp
+.IP Rodriguez86
+Rodriguez, R., M. Koehler, R. Hyde,
+``The Generic File System,''
+\fIUsenix Conference Proceedings\fP,
+pp. 260-269, June, 1986.
+.sp
+.IP Sandberg85
+Sandberg, R., D. Goldberg, S. Kleiman, D. Walsh, B. Lyon,
+``Design and Implementation of the Sun Network File System,''
+\fIUsenix Conference Proceedings\fP,
+pp. 119-130, June, 1985.
+.sp
+.IP Satyanarayanan85
+Satyanarayanan, M., \fIet al.\fP,
+``The ITC Distributed File System: Principles and Design,''
+\fIProc. 10th Symposium on Operating Systems Principles\fP, pp. 35-50,
+ACM, December, 1985.
+.sp
+.IP Steiner88
+Steiner, J., C. Neuman, J. Schiller,
+``\fIKerberos:\fP An Authentication Service for Open Network Systems,''
+\fIUsenix Conference Proceedings\fP, pp. 191-202, February, 1988.
+.sp
+.IP Walker85
+Walker, B.J. and S.H. Kiser, ``The LOCUS Distributed File System,''
+\fIThe LOCUS Distributed System Architecture\fP,
+G.J. Popek and B.J. Walker, ed., The MIT Press, Cambridge, MA, 1985.
+.sp
+.IP Weinberger84
+Weinberger, P.J., ``The Version 8 Network File System,''
+\fIUsenix Conference presentation\fP,
+June, 1984.
diff --git a/share/doc/papers/diskperf/Makefile b/share/doc/papers/diskperf/Makefile
new file mode 100644
index 000000000000..4370f81698c1
--- /dev/null
+++ b/share/doc/papers/diskperf/Makefile
@@ -0,0 +1,11 @@
+# @(#)Makefile 6.3 (Berkeley) 6/8/93
+
+DIR= papers/diskperf
+SRCS= abs.ms motivation.ms equip.ms methodology.ms tests.ms results.ms \
+ conclusions.ms appendix.ms
+MACROS= -ms
+
+paper.ps: ${SRCS}
+ ${TBL} ${SRCS} | ${ROFF} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/diskperf/abs.ms b/share/doc/papers/diskperf/abs.ms
new file mode 100644
index 000000000000..a61104d5de48
--- /dev/null
+++ b/share/doc/papers/diskperf/abs.ms
@@ -0,0 +1,176 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)abs.ms 6.2 (Berkeley) 4/16/91
+.\"
+.if n .ND
+.TL
+Performance Effects of Disk Subsystem Choices
+for VAX\(dg Systems Running 4.2BSD UNIX*
+.sp
+Revised July 27, 1983
+.AU
+Bob Kridle
+.AI
+mt Xinu
+2560 9th Street
+Suite #312
+Berkeley, California 94710
+.AU
+Marshall Kirk McKusick\(dd
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, CA 94720
+.AB
+.FS
+\(dgVAX, UNIBUS, and MASSBUS are trademarks of Digital Equipment Corporation.
+.FE
+.FS
+* UNIX is a trademark of Bell Laboratories.
+.FE
+.FS
+\(ddThis work was supported under grants from
+the National Science Foundation under grant MCS80-05144,
+and the Defense Advanced Research Projects Agency (DoD) under
+Arpa Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.FE
+Measurements were made of the UNIX file system
+throughput for various I/O operations using the most attractive currently
+available Winchester disks and controllers attached to both the
+native busses (SBI/CMI) and the UNIBUS on both VAX 11/780s and VAX 11/750s.
+The tests were designed to highlight the performance of single
+and dual drive subsystems operating in the 4.2BSD
+.I
+fast file system
+.R
+environment.
+Many of the results of the tests were initially counter-intuitive
+and revealed several important aspects of the VAX implementations
+which were surprising to us.
+.PP
+The hardware used included two Fujitsu 2351A
+``Eagle''
+disk drives on each of two foreign-vendor disk controllers
+and two DEC RA-81 disk drives on a DEC UDA-50 disk controller.
+The foreign-vendor controllers were Emulex SC750, SC780
+and Systems Industries 9900 native bus interfaced controllers.
+The DEC UDA-50 controller is a UNIBUS interfaced, heavily buffered
+controller which is the first implementation of a new DEC storage
+system architecture, DSA.
+.PP
+One of the most important results of our testing was the correction
+of several timing parameters in our device handler for devices
+with an RH750/RH780 type interface and having high burst transfer
+rates.
+The correction of these parameters resulted in an increase in
+performance of over twenty percent in some cases.
+In addition, one of the controller manufacturers altered their bus
+arbitration scheme to produce another increase in throughput.
+.AE
+.LP
+.de PT
+.lt \\n(LLu
+.pc %
+.nr PN \\n%
+.tl '\\*(LH'\\*(CH'\\*(RH'
+.lt \\n(.lu
+..
+.af PN i
+.ds LH Performance
+.ds RH Contents
+.bp 1
+.\".if t .ds CF July 27, 1983
+.\".if t .ds LF CSRG TR/8
+.\".if t .ds RF Kridle, et. al.
+.ce
+.B "TABLE OF CONTENTS"
+.LP
+.sp 1
+.nf
+.B "1. Motivation"
+.LP
+.sp .5v
+.nf
+.B "2. Equipment
+2.1. DEC UDA50 disk controller
+2.2. Emulex SC750/SC780 disk controllers
+2.3. Systems Industries 9900 disk controller
+2.4. DEC RA81 disk drives
+2.5. Fujitsu 2351A disk drives
+.LP
+.sp .5v
+.nf
+.B "3. Methodology
+.LP
+.sp .5v
+.nf
+.B "4. Tests
+.LP
+.sp .5v
+.nf
+.B "5. Results
+.LP
+.sp .5v
+.nf
+.B "6. Conclusions
+.LP
+.sp .5v
+.nf
+.B Acknowledgements
+.LP
+.sp .5v
+.nf
+.B References
+.LP
+.sp .5v
+.nf
+.B "Appendix A
+A.1. read_8192
+A.2. write_4096
+A.3. write_8192
+A.4. rewrite_8192
+.ds RH Motivation
+.af PN 1
+.bp 1
+.de _d
+.if t .ta .6i 2.1i 2.6i
+.\" 2.94 went to 2.6, 3.64 to 3.30
+.if n .ta .84i 2.6i 3.30i
+..
+.de _f
+.if t .ta .5i 1.25i 2.5i
+.\" 3.5i went to 3.8i
+.if n .ta .7i 1.75i 3.8i
+..
diff --git a/share/doc/papers/diskperf/appendix.ms b/share/doc/papers/diskperf/appendix.ms
new file mode 100644
index 000000000000..ccc487841ef8
--- /dev/null
+++ b/share/doc/papers/diskperf/appendix.ms
@@ -0,0 +1,98 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)appendix.ms 6.2 (Berkeley) 4/16/91
+.\"
+.nr H2 1
+.ds RH Appendix A
+.SH
+\s+2Appendix A\s0
+.SH
+read_8192
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+	j = open(argv[1], 0);		/* open argv[1] read-only */
+	for (i = 0; i < 1024; i++)	/* 8 Mbytes in 8 Kbyte reads */
+		read(j, buf, BUFSIZ);
+}
+.DE
+.SH
+write_4096
+.DS
+#define BUFSIZ 4096
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+	j = creat(argv[1], 0666);	/* create/truncate argv[1] */
+	for (i = 0; i < 2048; i++)	/* 8 Mbytes in 4 Kbyte writes */
+		write(j, buf, BUFSIZ);
+}
+.DE
+.SH
+write_8192
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+	j = creat(argv[1], 0666);	/* create/truncate argv[1] */
+	for (i = 0; i < 1024; i++)	/* 8 Mbytes in 8 Kbyte writes */
+		write(j, buf, BUFSIZ);
+}
+.DE
+.bp
+.SH
+rewrite_8192
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+	j = open(argv[1], 2);		/* open argv[1] read/write */
+	for (i = 0; i < 1024; i++)	/* overwrite 8 Mbytes in place */
+		write(j, buf, BUFSIZ);
+}
+.DE
diff --git a/share/doc/papers/diskperf/conclusions.ms b/share/doc/papers/diskperf/conclusions.ms
new file mode 100644
index 000000000000..538110679cab
--- /dev/null
+++ b/share/doc/papers/diskperf/conclusions.ms
@@ -0,0 +1,127 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)conclusions.ms 6.2 (Berkeley) 4/16/91
+.\"
+.ds RH Conclusions
+.NH
+Conclusions
+.PP
+Peak available throughput is only one criterion
+in most storage system purchasing decisions.
+Most of the VAX UNIX systems we are familiar with
+are not I/O bandwidth constrained.
+Nevertheless, an adequate disk bandwidth is necessary for
+good performance and especially to preserve snappy
+response time.
+All of the disk systems we tested provide more than
+adequate bandwidth for typical VAX UNIX system applications.
+Perhaps in some I/O-intensive applications such as
+image processing, more consideration should be given
+to the peak throughput available.
+In most situations, we feel that other factors are more
+important in making a storage choice between the systems we
+tested.
+Cost, reliability, availability, and support are some of these
+factors.
+The maturity of the technology purchased must also be weighed
+against the future value and expandability of newer technologies.
+.PP
+Two important conclusions about storage systems in general
+can be drawn from these tests.
+The first is that buffering can be effective in smoothing
+the effects of lower bus speeds and bus contention.
+Even though the UDA50 is located on the relatively slow
+UNIBUS, its performance is similar to controllers located on
+the faster processor busses.
+However, the SC780 with only one sector of buffering shows that
+little buffering is needed if the underlying bus is fast enough.
+.PP
+Placing more intelligence in the controller seems to hinder UNIX system
+performance more than it helps.
+Our profiling tests have indicated that UNIX spends about
+the same percentage of time in the SC780 driver and the UDA50 driver
+(about 10-14%).
+Normally UNIX uses a disk sort algorithm that separates reads and
+writes into two seek order queues.
+The read queue has priority over the write queue,
+since reads cause processes to block,
+while writes can be done asynchronously.
+This is particularly useful when generating large files,
+as it allows the disk allocator to read
+new disk maps and begin doing new allocations
+while the blocks allocated out of the previous map are written to disk.
+Because the UDA50 handles all block ordering,
+and because it keeps all requests in a single queue,
+there is no way to force the longer seek needed to get the next disk map.
+This dysfunction causes all the writes to be done before the disk map read,
+which idles the disk until a new set of blocks can be allocated.
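+.PP
+The policy lost here is simple to state; as a sketch (not the driver
+code itself, and using the kernel's \fIstruct buf\fP buffer header),
+the traditional handler does the equivalent of:
+.DS
+struct buf *
+nextrequest(readq, writeq)
+	struct buf *readq, *writeq;	/* each kept in seek order */
+{
+	if (readq != 0)		/* reads block processes: serve first */
+		return (readq);
+	return (writeq);	/* writes are asynchronous and can wait */
+}
+.DE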
+.PP
+The additional functionality of the UDA50 controller that allows it
+to transfer from two drives simultaneously makes
+the two-drive transfer tests run much more effectively.
+Tuning done for the single-drive case carries over better to the
+two-drive case than it does with controllers that cannot handle
+simultaneous transfers.
+.ds RH Acknowledgements
+.nr H2 1
+.sp 1
+.SH
+\s+2Acknowledgements\s0
+.PP
+We thank Paul Massigilia and Bill Grace
+of Digital Equipment Corp for helping us run our
+disk tests on their UDA50/RA81.
+We also thank Rich Notari and Paul Ritkowski
+of Emulex for making their machines available
+to us to run our tests of the SC780/Eagles.
+Dan McKinster, then of Systems Industries,
+arranged to make their equipment available for the tests.
+We appreciate the time provided by Bob Gross, Joe Wolf, and
+Sam Leffler on their machines to refine our benchmarks.
+Finally we thank our sponsors,
+the National Science Foundation under grant MCS80-05144,
+and the Defense Advanced Research Projects Agency (DoD) under
+Arpa Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.ds RH References
+.nr H2 1
+.sp 1
+.SH
+\s+2References\s0
+.LP
+.IP [McKusick83] 20
+M. McKusick, W. Joy, S. Leffler, R. Fabry,
+``A Fast File System for UNIX'',
+\fIACM Transactions on Computer Systems 2\fP, 3,
+pp. 181-197, August 1984.
+.ds RH Appendix A
+.bp
diff --git a/share/doc/papers/diskperf/equip.ms b/share/doc/papers/diskperf/equip.ms
new file mode 100644
index 000000000000..264ea0494737
--- /dev/null
+++ b/share/doc/papers/diskperf/equip.ms
@@ -0,0 +1,177 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)equip.ms 6.2 (Berkeley) 4/16/91
+.\"
+.ds RH Equipment
+.NH
+Equipment
+.PP
+Various combinations of the three manufacturers' disk controllers
+and two pairs of Winchester disk drives were tested on both
+VAX 11/780 and VAX 11/750 CPUs. The Emulex and Systems Industries
+disk controllers were interfaced to Fujitsu 2351A
+``Eagle''
+404 Megabyte disk drives.
+The DEC UDA50 disk controller was interfaced to two DEC RA81
+456 Megabyte Winchester disk drives.
+All three controllers were tested on the VAX 11/780, although
+only the Emulex and DEC controllers were benchmarked on the VAX 11/750.
+Systems Industries makes a VAX 11/750 CMI interface for
+their controller, but we did not have time to test this device.
+In addition, not all the storage systems were tested for
+two drive throughput.
+Each of the controllers and disk drives used in the benchmarks
+is described briefly below.
+.NH 2
+DEC UDA50 disk controller
+.PP
+This is a new controller design which is part of a larger, long range
+storage architecture referred to as
+``DSA''
+or \fBD\fRigital \fBS\fRtorage \fBA\fRrchitecture.
+An important aspect of DSA is migrating a large part
+of the storage management previously handled in the operating
+system to the storage system. Thus, the UDA50 is a much more
+intelligent controller than previous interfaces like the RH750 or
+RH780.
+The UDA50 handles all error correction.
+It also deals with most of the physical storage parameters.
+Typically, system software requests a logical block or
+sequence of blocks.
+The physical locations of these blocks,
+their head, track, and cylinder indices,
+are determined by the controller.
+The UDA50 also orders disk requests to maximize throughput
+where possible, minimizing total seek and rotational delays.
+Where multiple drives are attached to a single controller,
+the UDA50 can interleave
+simultaneous
+data transfers from multiple drives.
+.PP
+The UDA50 is a UNIBUS implementation of a DSA controller.
+It contains 52 sectors of internal buffering to minimize
+the effects of a slow UNIBUS such as the one on the VAX-11/780.
+This buffering also minimizes the effects of contention with
+other UNIBUS peripherals.
+.NH 2
+Emulex SC750/SC780 disk controllers
+.PP
+These two models of the same controller interface to the CMI bus
+of a VAX 11/750 and the SBI bus of a VAX 11/780, respectively.
+To the operating system, they emulate either an RH750 or
+an RH780.
+The controllers install in the
+MASSBUS
+locations in the CPU cabinets and operate from the
+VAX power supplies.
+They provide an
+``SMD''
+or \fBS\fRtorage \fBM\fRodule \fBD\fRrive
+interface to the disk drives.
+Although a large number of disk drives use this interface, we tested
+the controller exclusively connected to Fujitsu 2351A disks.
+.PP
+The controller was first implemented for the VAX-11/750 as the SC750
+model several years ago. Although the SC780 was introduced more
+recently, both are stable products with no bugs known to us.
+.NH 2
+Systems Industries 9900 disk controller
+.PP
+This controller is an evolution of the S.I. 9400 first introduced
+as a UNIBUS SMD interface.
+The 9900 has been enhanced to include an interface to the VAX 11/780 native
+bus, the SBI.
+It has also been upgraded to operate with higher data rate drives such
+as the Fujitsu 2351As we used in this test.
+The controller is contained in its own rack-mounted drawer with an integral
+power supply.
+The interface to the SMD is a four module set which mounts in a
+CPU cabinet slot normally occupied by an RH780.
+The SBI interface derives power from the VAX CPU cabinet power
+supplies.
+.NH 2
+DEC RA81 disk drives
+.PP
+The RA81 is a rack-mountable 456 Megabyte (formatted) Winchester
+disk drive manufactured by DEC.
+It includes a great deal of technology which is an integral part
+of the DEC \fBDSA\fR scheme.
+The novel technology includes a serial, packet-based communications
+protocol with the controller over a pair of mini-coaxial cables.
+The physical characteristics of the RA81 are shown in the
+table below:
+.DS
+.TS
+box,center;
+c s
+l l.
+DEC RA81 Disk Drive Characteristics
+_
+Peak Transfer Rate 2.2 Mbytes/sec.
+Rotational Speed 3,600 RPM
+Data Sectors/Track 51
+Logical Cylinders 1,248
+Logical Data Heads 14
+Data Capacity 456 Mbytes
+Minimum Seek Time 6 milliseconds
+Average Seek Time 28 milliseconds
+Maximum Seek Time 52 milliseconds
+.TE
+.DE
+.NH 2
+Fujitsu 2351A disk drives
+.PP
+The Fujitsu 2351A disk drive is a Winchester disk drive
+with an SMD controller interface.
+Fujitsu has developed a very good reputation for
+reliable storage products over the last several years.
+The 2351A has the following physical characteristics:
+.DS
+.TS
+box,center;
+c s
+l l.
+Fujitsu 2351A Disk Drive Characteristics
+_
+Peak Transfer Rate 1.859 Mbytes/sec.
+Rotational Speed 3,961 RPM
+Data Sectors/Track 48
+Cylinders 842
+Data Heads 20
+Data Capacity 404 Mbytes
+Minimum Seek Time 5 milliseconds
+Average Seek Time 18 milliseconds
+Maximum Seek Time 35 milliseconds
+.TE
+.DE
+.ds RH Methodology
+.bp
diff --git a/share/doc/papers/diskperf/methodology.ms b/share/doc/papers/diskperf/methodology.ms
new file mode 100644
index 000000000000..703d7b6f0545
--- /dev/null
+++ b/share/doc/papers/diskperf/methodology.ms
@@ -0,0 +1,111 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)methodology.ms 6.2 (Berkeley) 4/16/91
+.\"
+.ds RH Methodology
+.NH
+Methodology
+.PP
+Our goal was to evaluate the performance of the target peripherals
+in an environment as much like our 4.2BSD UNIX systems as possible.
+There are two basic approaches to creating this kind of test environment.
+These might be termed the \fIindirect\fR and the \fIdirect\fR approach.
+The approach used by DEC in producing most of the performance data
+on the UDA50/RA81 system under VMS is what we term the indirect
+approach.
+We chose to use the direct approach.
+.PP
+The indirect approach used by DEC involves two steps.
+First, the environment in which performance is to be evaluated
+is parameterized.
+In this case, the disk I/O characteristics of VMS were measured
+as to the distribution of various sizes of accesses and the proportion
+of reads and writes.
+This parameterization of
+typical
+I/O activity was termed a
+``vax mix.''
+The second stage involves simulating this mixture of I/O activities
+with the devices to be tested and noting the total volume of transactions
+processed per unit time by each system.
+.PP
+The problems encountered with this indirect approach often
+have to do with the completeness and correctness of the parameterization
+of the context environment.
+For example, the
+``vax mix''
+model constructed for DEC's tests uses a random distribution of seeks
+to the blocks read or written.
+It is not likely that any real system produces a distribution
+of disk transfer locations which is truly random and does not
+exhibit strong locality characteristics.
+.PP
+The methodology chosen by us is direct
+in the sense that it uses the standard structured file system mechanism present
+in the 4.2BSD UNIX operating system to create the sequence of locations
+and sizes of reads and writes to the benchmarked equipment.
+We simply create, write, and read
+files as they would be by users' activities.
+The disk space allocation and disk caching mechanisms built into
+UNIX are used to produce the actual device reads and writes as well
+as to determine their size and location on the disk.
+We measure and compare the rate at which these
+.I
+user files
+.R
+can be written, rewritten, or read.
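+.PP
+(The rate measurement itself is straightforward; a sketch of one way
+to time such a test, where \fCruntest()\fP stands for one of the
+benchmark programs of Appendix A and the byte count is that program's
+known transfer volume:)
+.DS
+#include <stdio.h>
+#include <sys/time.h>
+
+main()
+{
+	struct timeval t0, t1;
+	double sec, nbytes = 8192.0 * 1024;	/* e.g. read_8192 */
+
+	gettimeofday(&t0, (struct timezone *)0);
+	runtest();			/* placeholder: the test body */
+	gettimeofday(&t1, (struct timezone *)0);
+	sec = (t1.tv_sec - t0.tv_sec) +
+	    (t1.tv_usec - t0.tv_usec) / 1000000.0;
+	printf("%.1f Kbytes/sec\en", nbytes / 1024.0 / sec);
+}
+.DE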
+.PP
+The advantage of this approach is the implicit accuracy in
+testing in the same environment in which the peripheral
+will be used.
+Although this system does not account for the I/O produced
+by some paging and swapping, in our memory-rich environment
+these activities account for a relatively small portion
+of the total disk activity.
+.PP
+A more significant disadvantage to the direct approach
+is the occasional difficulty we have in accounting for our
+measured results.
+The apparently straightforward activity of reading or writing a logical file
+on disk can produce a complex mixture of disk traffic.
+File I/O is supported by a file management system that
+buffers disk traffic through an internal cache,
+which allows writes to be handled asynchronously.
+Reads must be done synchronously;
+however, this restriction is moderated by the use of read-ahead.
+Small changes in the performance of the disk controller
+subsystem can result in large and unexpected
+changes in the file system performance,
+as it may change the characteristics of the memory contention
+experienced by the processor.
+.ds RH Tests
+.bp
diff --git a/share/doc/papers/diskperf/motivation.ms b/share/doc/papers/diskperf/motivation.ms
new file mode 100644
index 000000000000..2884fc59f52f
--- /dev/null
+++ b/share/doc/papers/diskperf/motivation.ms
@@ -0,0 +1,93 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)motivation.ms 6.2 (Berkeley) 4/16/91
+.\"
+.ds RH Motivation
+.NH
+Motivation
+.PP
+These benchmarks were performed for several reasons.
+Foremost was our desire to obtain guidelines to aid
+in choosing one of the most expensive components of any
+VAX UNIX configuration, the disk storage system.
+The range of choices in this area has increased dramatically
+in the last year.
+DEC has become, with the introduction of the UDA50/RA81 system,
+cost competitive
+in the area of disk storage for the first time.
+Emulex's entry into the VAX 11/780 SBI controller
+field, the SC780, represented an important choice for us to examine, given
+our previous success with their VAX 11/750 SC750 controller and
+their UNIBUS controllers.
+The Fujitsu 2351A
+Winchester disk drive represents the lowest cost-per-byte disk storage
+known to us.
+In addition, Fujitsu's reputation for reliability was appealing.
+The many attractive aspects of these components justified a more
+careful examination of their performance under UNIX.
+.PP
+In addition to the direct motivation of developing an effective
+choice of storage systems, we hoped to gain more insight into
+VAX UNIX file system and I/O performance in general.
+What generic characteristics of I/O subsystems are most
+important?
+How important is the location of the controller on the SBI/CMI versus
+the UNIBUS?
+Is extensive buffering in the controller essential or even important?
+How much can be gained by putting more of the storage system
+management and optimization function in the controller as
+DEC does with the UDA50?
+.PP
+We also wanted to resolve particular speculation about the value of
+storage system optimization by a controller in a UNIX
+environment.
+Is the access optimization as effective as that already provided
+by the existing 4.2BSD UNIX device handlers for traditional disks?
+VMS disk handlers do no seek optimization.
+This gives the UDA50 controller an advantage over other controllers
+under VMS which is not likely to be as important to UNIX.
+Are there penalties associated with greater intelligence in the controller?
+.PP
+A third and last reason for evaluating this equipment is comparable
+to the proverbial mountain climber's answer when asked why he climbs
+a particular mountain,
+``It was there.''
+In our case the equipment
+was there.
+We were lucky enough to assemble all the desired disks and controllers
+and get them installed on a temporarily idle VAX 11/780.
+This got us started collecting data.
+Although many of the tests were later rerun on a variety of other systems,
+this initial test bed was essential for working out the testing bugs
+and getting our feet wet.
+.ds RH Equipment
+.bp
diff --git a/share/doc/papers/diskperf/results.ms b/share/doc/papers/diskperf/results.ms
new file mode 100644
index 000000000000..09f61a81824f
--- /dev/null
+++ b/share/doc/papers/diskperf/results.ms
@@ -0,0 +1,337 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)results.ms 6.2 (Berkeley) 4/16/91
+.\"
+.ds RH Results
+.NH
+Results
+.PP
+The following tables indicate the results of our
+test runs.
+Note that each table contains results for tests run
+on two varieties of 4.2BSD file systems.
+The first set of results is always for a file system
+with a basic blocking factor of eight Kilobytes and a
+fragment size of one Kilobyte. The second set of measurements
+is for file systems with a four Kilobyte block size and a
+one Kilobyte fragment size.
+The values in parentheses indicate the percentage of CPU
+time used by the test program.
+In the case of the two-disk-arm tests,
+the value in parentheses indicates the sum of the percentages
+of CPU time used by the two test programs that were run.
+Entries of ``n. m.'' indicate that the value was not measured.
+.DS
+.TS
+box,center;
+c s s s s
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+4.2BSD File Systems Tests - \fBVAX 11/750\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC750/Eagle UDA50/RA81
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 490 (69%) 620 (96%) 310 (44%) 520 (65%)
+write_4096 380 (99%) 370 (99%) 370 (97%) 360 (98%)
+write_8192 470 (99%) 470 (99%) 320 (71%) 410 (83%)
+rewrite_8192 650 (99%) 620 (99%) 310 (50%) 450 (70%)
+=
+.T&
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+Logically Sequential Transfers
+from \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC750/Eagle UDA50/RA81
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 300 (60%) 400 (84%) 210 (42%) 340 (77%)
+write_4096 320 (98%) 320 (98%) 220 (67%) 290 (99%)
+write_8192 340 (98%) 340 (99%) 220 (65%) 310 (98%)
+rewrite_8192 450 (99%) 450 (98%) 230 (47%) 340 (78%)
+.TE
+.DE
+.PP
+Note that the rates of write operations on the VAX 11/750 are ultimately
+CPU limited in some cases.
+The write rates saturate the CPU at a lower bandwidth than the reads
+because they must do disk allocation in addition to moving the data
+from the user program to the disk.
+The UDA50/RA81 saturates the CPU at a lower transfer rate for a given
+operation than the SC750/Eagle because
+it causes more memory contention with the CPU.
+We do not know if this contention is caused by
+the UNIBUS controller or the UDA50.
+.PP
+The following table reports the results of test runs on a VAX 11/780
+with 4 Megabytes of main memory.
+.DS
+.TS
+box,center;
+c s s s s s s
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l l | l l | l l
+l | c c | c c | c c.
+4.2BSD File Systems Tests - \fBVAX 11/780\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC780/Eagle UDA50/RA81 Sys. Ind. 9900/Eagle
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 560 (70%) 480 (58%) 360 (45%) 540 (72%) 340 (41%) 520 (66%)
+write_4096 440 (98%) 440 (98%) 380 (99%) 480 (96%) 490 (96%) 440 (84%)
+write_8192 490 (98%) 490 (98%) 220 (58%)* 480 (92%) 490 (80%) 430 (72%)
+rewrite_8192 760 (100%) 560 (72%) 220 (50%)* 180 (52%)* 490 (60%) 520 (62%)
+=
+.T&
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l l | l l | l l
+l | c c | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Emulex SC780/Eagle UDA50/RA81 Sys. Ind. 9900/Eagle
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 490 (77%) 370 (66%) n.m. n.m. 200 (31%) 370 (56%)
+write_4096 380 (98%) 370 (98%) n.m. n.m. 200 (46%) 370 (88%)
+write_8192 380 (99%) 370 (97%) n.m. n.m. 200 (45%) 320 (76%)
+rewrite_8192 490 (87%) 350 (66%) n.m. n.m. 200 (31%) 300 (46%)
+.TE
+* the operation of the hardware was suspect during these tests.
+.DE
+.PP
+The dropoff in reading and writing rates for the two-drive SC780/Eagle
+tests is probably due to the file system using insufficient
+rotational delay for these tests.
+We have not fully investigated these times.
+.PP
+The following table compares data rates on VAX 11/750s directly
+with those of VAX 11/780s using the UDA50/RA81 storage system.
+.DS
+.TS
+box,center;
+c s s s s
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+4.2BSD File Systems Tests - \fBDEC UDA50 - 750 vs. 780\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 UNIBUS VAX 11/780 UNIBUS
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 310 (44%) 520 (84%) 360 (45%) 540 (72%)
+write_4096 370 (97%) 360 (100%) 380 (99%) 480 (96%)
+write_8192 320 (71%) 410 (96%) 220 (58%)* 480 (92%)
+rewrite_8192 310 (50%) 450 (80%) 220 (50%)* 180 (52%)*
+=
+.T&
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 UNIBUS VAX 11/780 UNIBUS
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 210 (42%) 342 (77%) n.m. n.m.
+write_4096 215 (67%) 294 (99%) n.m. n.m.
+write_8192 215 (65%) 305 (98%) n.m. n.m.
+rewrite_8192 227 (47%) 336 (78%) n.m. n.m.
+.TE
+* the operation of the hardware was suspect during these tests.
+.DE
+.PP
+The higher throughput available on VAX 11/780s is due to a number
+of factors.
+The larger main memory size allows a larger file system cache.
+The block allocation routines run faster, raising the upper limit
+on the data rates in writing new files.
+.PP
+The next table makes the same comparison using an Emulex controller
+on both systems.
+.DS
+.TS
+box, center;
+c s s s s
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+4.2BSD File Systems Tests - \fBEmulex - 750 vs. 780\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 CMI Bus VAX 11/780 SBI Bus
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 490 (69%) 620 (96%) 560 (70%) 480 (58%)
+write_4096 380 (99%) 370 (99%) 440 (98%) 440 (98%)
+write_8192 470 (99%) 470 (99%) 490 (98%) 490 (98%)
+rewrite_8192 650 (99%) 620 (99%) 760 (100%) 560 (72%)
+=
+.T&
+c s s s s
+c s s s s
+l | l s | l s
+l | l s | l s
+l | l l | l l
+l | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test VAX 11/750 CMI Bus VAX 11/780 SBI Bus
+
+ 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 300 (60%) 400 (84%) 490 (77%) 370 (66%)
+write_4096 320 (98%) 320 (98%) 380 (98%) 370 (98%)
+write_8192 340 (98%) 340 (99%) 380 (99%) 370 (97%)
+rewrite_8192 450 (99%) 450 (98%) 490 (87%) 350 (66%)
+.TE
+.DE
+.PP
+The following table illustrates the evolution of our testing
+process as both hardware and software problems affecting
+the performance of the Emulex SC780 were corrected.
+The software change was suggested to us by George Goble
+of Purdue University.
+.PP
+The 4.2BSD handler for RH750/RH780-interfaced disk drives
+contains several constants that determine how
+much time is provided between an interrupt signaling the completion
+of a positioning command and the subsequent start of a data transfer
+operation. These lead times are expressed as sectors of rotational delay.
+If they are too small, an extra complete rotation will often be required
+between a seek and subsequent read or write operation.
+The higher bit rate and rotational speed of the 2351A Fujitsu
+disk drives required
+increasing these constants.
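+.PP
+The arithmetic behind these lead times is easy to sketch.
+The following fragment is a minimal illustration, not the
+actual 4.2BSD handler code, and the rotational speed given
+is an assumed figure rather than a handler constant:
+.DS
+/*
+ * Illustrative only; the rotational speed is an assumed
+ * figure for the Fujitsu Eagle, not a handler constant.
+ */
+#define RPM	3961		/* assumed rotational speed */
+
+/*
+ * Milliseconds lost when "lead" sectors of rotational delay
+ * are provided but "need" sectors are actually required:
+ * a full extra rotation whenever the lead is too small.
+ */
+int
+penalty(lead, need)
+	int lead, need;
+{
+	int msperrev = 60000 / RPM;	/* about 15 ms per revolution */
+
+	return (lead < need ? msperrev : 0);
+}
+.DE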
+.PP
+The hardware change involved allowing for slightly longer
+delays in arbitrating for cycles on the SBI bus by
+starting the bus arbitration cycle a little further ahead of
+when the data was ready for transfer.
+Finally, we had to increase the rotational delay between consecutive
+blocks in the file because
+the higher bandwidth from the disk generated more memory contention,
+which slowed down the processor.
+.DS
+.TS
+box,center,expand;
+c s s s s s s
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l s | l s | l s
+l | c c | c c | c c
+l | c c | c c | c c.
+4.2BSD File Systems Tests - \fBEmulex SC780 Disk Controller Evolution\fR
+=
+Logically Sequential Transfers
+from an \fB8K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Inadequate Search Lead OK Search Lead OK Search Lead
+ Initial SBI Arbitration Init SBI Arb. Improved SBI Arb.
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 320 370 440 (60%) n.m. 560 (70%) 480 (58%)
+write_4096 250 270 300 (63%) n.m. 440 (98%) 440 (98%)
+write_8192 250 280 340 (60%) n.m. 490 (98%) 490 (98%)
+rewrite_8192 250 290 380 (48%) n.m. 760 (100%) 560 (72%)
+=
+.T&
+c s s s s s s
+c s s s s s s
+l | l s | l s | l s
+l | l s | l s | l s
+l | l s | l s | l s
+l | c c | c c | c c
+l | c c | c c | c c.
+Logically Sequential Transfers
+from an \fB4K/1K\fR 4.2BSD File System (Kbytes/sec.)
+_
+Test Inadequate Search Lead OK Search Lead OK Search Lead
+ Initial SBI Arbitration Init SBI Arb. Improved SBI Arb.
+
+ 1 Drive 2 Drives 1 Drive 2 Drives 1 Drive 2 Drives
+_
+read_8192 200 220 280 n.m. 490 (77%) 370 (66%)
+write_4096 180 190 300 n.m. 380 (98%) 370 (98%)
+write_8192 180 200 320 n.m. 380 (99%) 370 (97%)
+rewrite_8192 190 200 340 n.m. 490 (87%) 350 (66%)
+.TE
+.DE
+.ds RH Conclusions
+.bp
diff --git a/share/doc/papers/diskperf/tests.ms b/share/doc/papers/diskperf/tests.ms
new file mode 100644
index 000000000000..1809afec943d
--- /dev/null
+++ b/share/doc/papers/diskperf/tests.ms
@@ -0,0 +1,108 @@
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)tests.ms 6.2 (Berkeley) 4/16/91
+.\"
+.ds RH Tests
+.NH
+Tests
+.PP
+Our battery of tests consists of four programs,
+read_8192, write_8192, write_4096
+and rewrite_8192 originally written by [McKusick83]
+to evaluate the performance of the new file system in 4.2BSD.
+These programs all follow the same model and are typified by
+read_8192, shown here.
+.DS
+#define BUFSIZ 8192
+main( argc, argv)
+char **argv;
+{
+ char buf[BUFSIZ];
+ int i, j;
+
+ j = open(argv[1], 0);
+ for (i = 0; i < 1024; i++)
+ read(j, buf, BUFSIZ);
+}
+.DE
+The remaining programs are included in appendix A.
+.PP
+These programs read, write with two different blocking factors,
+and rewrite logical files in a structured file system on the disk
+under test.
+The write programs create new files while the rewrite program
+overwrites an existing file.
+Each of these programs represents an important segment of the
+typical UNIX file system activity, with the read program
+representing by far the largest class and the rewrite the smallest.
+.PP
+A blocking factor of 8192 is used by all programs except write_4096.
+This is typical of most 4.2BSD user programs since a standard set of
+I/O support routines is commonly used and these routines buffer
+data in similar block sizes.
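+.PP
+As a minimal sketch of this convention (and not code from the
+standard I/O library itself), a program can match its buffer
+size to the block size of the underlying file system as
+reported by \fIfstat\fP:
+.DS
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/*
+ * Sketch: return a buffer size matched to the file system
+ * holding the open file fd, as the 4.2BSD standard I/O
+ * routines do; 1024 is an arbitrary fallback.
+ */
+int
+iosize(fd)
+	int fd;
+{
+	struct stat st;
+
+	if (fstat(fd, &st) < 0)
+		return (1024);
+	return (st.st_blksize);	/* typically 8192 or 4096 */
+}
+.DE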
+.PP
+For each test run, an empty eight-Kilobyte-block
+file system was created in the target
+storage system.
+Then each of the four tests was run and timed.
+Each test was run three times;
+the first to clear out any useful data in the cache,
+and the second two to ensure that the experiment
+had stabilized and was repeatable.
+Each test operated on eight Megabytes of data to
+ensure that the cache did not overly influence the results.
+Another file system was then initialized using a
+basic blocking factor of four Kilobytes and the same tests
+were run again and timed.
+A command script for a run appears as follows:
+.DS
+#!/bin/csh
+set time=2
+echo "8K/1K file system"
+newfs /dev/rhp0g eagle
+mount /dev/hp0g /mnt0
+mkdir /mnt0/foo
+echo "write_8192 /mnt0/foo/tst2"
+rm -f /mnt0/foo/tst2
+write_8192 /mnt0/foo/tst2
+rm -f /mnt0/foo/tst2
+write_8192 /mnt0/foo/tst2
+rm -f /mnt0/foo/tst2
+write_8192 /mnt0/foo/tst2
+echo "read_8192 /mnt0/foo/tst2"
+read_8192 /mnt0/foo/tst2
+read_8192 /mnt0/foo/tst2
+read_8192 /mnt0/foo/tst2
+umount /dev/hp0g
+.DE
+.ds RH Results
+.bp
diff --git a/share/doc/papers/fsinterface/Makefile b/share/doc/papers/fsinterface/Makefile
new file mode 100644
index 000000000000..cb1d91398508
--- /dev/null
+++ b/share/doc/papers/fsinterface/Makefile
@@ -0,0 +1,7 @@
+# @(#)Makefile 5.3 (Berkeley) 6/8/93
+
+DIR= papers/fsinterface
+SRCS= fsinterface.ms
+MACROS= -ms
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/fsinterface/abstract.ms b/share/doc/papers/fsinterface/abstract.ms
new file mode 100644
index 000000000000..ab8b473170e1
--- /dev/null
+++ b/share/doc/papers/fsinterface/abstract.ms
@@ -0,0 +1,73 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)abstract.ms 5.2 (Berkeley) 4/16/91
+.\"
+.TL
+Toward a Compatible Filesystem Interface
+.AU
+Michael J. Karels
+Marshall Kirk McKusick
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.LP
+As network or remote filesystems have been implemented for
+.UX ,
+several stylized interfaces between the filesystem implementation
+and the rest of the kernel have been developed.
+Notable among these are Sun Microsystems' virtual filesystem interface
+using vnodes, Digital Equipment's Generic File System architecture,
+and AT&T's File System Switch.
+Each design attempts to isolate filesystem-dependent details
+below the generic interface and to provide a framework within which
+new filesystems may be incorporated.
+However, each of these interfaces is different from
+and incompatible with the others.
+Each of them addresses somewhat different design goals.
+Each was based upon a different starting version of
+.UX ,
+targeted a different set of filesystems with varying characteristics,
+and uses a different set of primitive operations provided by the filesystem.
+The current study compares the various filesystem interfaces.
+Criteria for comparison include generality, completeness, robustness,
+efficiency and esthetics.
+As a result of this comparison, a proposal for a new filesystem interface
+is advanced that includes the best features of the existing implementations.
+The proposal adopts the calling convention for name lookup introduced
+in 4.3BSD.
+A prototype implementation is described.
+This proposal and the rationale underlying its development
+have been presented to major software vendors
+as an early step toward convergence upon a compatible filesystem interface.
diff --git a/share/doc/papers/fsinterface/fsinterface.ms b/share/doc/papers/fsinterface/fsinterface.ms
new file mode 100644
index 000000000000..c5722e6d8e96
--- /dev/null
+++ b/share/doc/papers/fsinterface/fsinterface.ms
@@ -0,0 +1,1176 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fsinterface.ms 1.4 (Berkeley) 4/16/91
+.\"
+.if \nv .rm CM
+.de UX
+.ie \\n(UX \s-1UNIX\s0\\$1
+.el \{\
+\s-1UNIX\s0\\$1\(dg
+.FS
+\(dg \s-1UNIX\s0 is a registered trademark of AT&T.
+.FE
+.nr UX 1
+.\}
+..
+.TL
+Toward a Compatible Filesystem Interface
+.AU
+Michael J. Karels
+Marshall Kirk McKusick
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+.LP
+As network or remote filesystems have been implemented for
+.UX ,
+several stylized interfaces between the filesystem implementation
+and the rest of the kernel have been developed.
+.FS
+This is an update of a paper originally presented
+at the September 1986 conference of the European
+.UX
+Users' Group.
+Last modified April 16, 1991.
+.FE
+Notable among these are Sun Microsystems' Virtual Filesystem interface (VFS)
+using vnodes, Digital Equipment's Generic File System (GFS) architecture,
+and AT&T's File System Switch (FSS).
+Each design attempts to isolate filesystem-dependent details
+below a generic interface and to provide a framework within which
+new filesystems may be incorporated.
+However, each of these interfaces is different from
+and incompatible with the others.
+Each of them addresses somewhat different design goals.
+Each was based on a different starting version of
+.UX ,
+targeted a different set of filesystems with varying characteristics,
+and uses a different set of primitive operations provided by the filesystem.
+The current study compares the various filesystem interfaces.
+Criteria for comparison include generality, completeness, robustness,
+efficiency and esthetics.
+Several of the underlying design issues are examined in detail.
+As a result of this comparison, a proposal for a new filesystem interface
+is advanced that includes the best features of the existing implementations.
+The proposal adopts the calling convention for name lookup introduced
+in 4.3BSD, but is otherwise closely related to Sun's VFS.
+A prototype implementation is now being developed at Berkeley.
+This proposal and the rationale underlying its development
+have been presented to major software vendors
+as an early step toward convergence on a compatible filesystem interface.
+.AE
+.SH
+Introduction
+.PP
+As network communications and workstation environments
+became common elements in
+.UX
+systems, several vendors of
+.UX
+systems have designed and built network file systems
+that allow client processes on one
+.UX
+machine to access files on a server machine.
+Examples include Sun's Network File System, NFS [Sandberg85],
+AT&T's recently-announced Remote File Sharing, RFS [Rifkin86],
+the LOCUS distributed filesystem [Walker85],
+and Masscomp's extended filesystem [Cole85].
+Other remote filesystems have been implemented in research or university groups
+for internal use, notably the network filesystem in the Eighth Edition
+.UX
+system [Weinberger84] and two different filesystems used at Carnegie-Mellon
+University [Satyanarayanan85].
+Numerous other remote file access methods have been devised for use
+within individual
+.UX
+processes,
+many of them by modifications to the C I/O library
+similar to those in the Newcastle Connection [Brownbridge82].
+.PP
+Multiple network filesystems may frequently
+be found in use within a single organization.
+These circumstances make it highly desirable to be able to transport filesystem
+implementations from one system to another.
+Such portability is considerably enhanced by the use of a stylized interface
+with carefully-defined entry points to separate the filesystem from the rest
+of the operating system.
+This interface should be similar to the interface between device drivers
+and the kernel.
+Although varying somewhat among the common versions of
+.UX ,
+the device driver interfaces are sufficiently similar that device drivers
+may be moved from one system to another without major problems.
+A clean, well-defined interface to the filesystem also allows a single
+system to support multiple local filesystem types.
+.PP
+For reasons such as these, several filesystem interfaces have been used
+when integrating new filesystems into the system.
+The best-known of these are Sun Microsystems' Virtual File System interface,
+VFS [Kleiman86], and AT&T's File System Switch, FSS.
+Another interface, known as the Generic File System, GFS,
+has been implemented for the ULTRIX\(dd
+.FS
+\(dd ULTRIX is a trademark of Digital Equipment Corp.
+.FE
+system by Digital [Rodriguez86].
+There are numerous differences among these designs.
+The differences may be understood from the varying philosophies
+and design goals of the groups involved, from the systems under which
+the implementations were done, and from the filesystems originally targeted
+by the designs.
+These differences are summarized in the following sections
+within the limitations of the published specifications.
+.SH
+Design goals
+.PP
+There are several design goals which, in varying degrees,
+have driven the various designs.
+Each attempts to divide the filesystem into a filesystem-type-independent
+layer and individual filesystem implementations.
+The division between these layers occurs at somewhat different places
+in these systems, reflecting different views of the diversity and types
+of the filesystems that may be accommodated.
+Compatibility with existing local filesystems has varying importance;
+at the user-process level, each attempts to be completely transparent
+except for a few filesystem-related system management programs.
+The AT&T interface also makes a major effort to retain familiar internal
+system interfaces, and even to retain object-file-level binary compatibility
+with operating system modules such as device drivers.
+Both Sun and DEC were willing to change internal data structures and interfaces
+so that other operating system modules might require recompilation
+or source-code modification.
+.PP
+AT&T's interface both allows and requires filesystems to support the full
+and exact semantics of their previous filesystem,
+including interruptions of system calls on slow operations.
+System calls that deal with remote files are encapsulated
+with their environment and sent to a server where execution continues.
+The system call may be aborted by either client or server, returning
+control to the client.
+Most system calls that descend into the filesystem-dependent layer
+of a filesystem other than the standard local filesystem do not return
+to the higher-level kernel calling routines.
+Instead, the filesystem-dependent code completes the requested
+operation and then executes a non-local goto (\fIlongjmp\fP) to exit the
+system call.
+These efforts to avoid modification of main-line kernel code
+indicate a far greater emphasis on internal compatibility than on modularity,
+clean design, or efficiency.
+.PP
+In contrast, the Sun VFS interface makes major modifications to the internal
+interfaces in the kernel, with a very clear separation
+of filesystem-independent and -dependent data structures and operations.
+The semantics of the filesystem are largely retained for local operations,
+although this is achieved at some expense where it does not fit the internal
+structuring well.
+The filesystem implementations are not required to support the same
+semantics as local
+.UX
+filesystems.
+Several historical features of
+.UX
+filesystem behavior are difficult to achieve using the VFS interface,
+including the atomicity of file and link creation and the use of open files
+whose names have been removed.
+.PP
+A major design objective of Sun's network filesystem,
+statelessness,
+permeates the VFS interface.
+No locking may be done in the filesystem-independent layer,
+and locking in the filesystem-dependent layer may occur only during
+a single call into that layer.
+.PP
+A final design goal of most implementors is performance.
+For remote filesystems,
+this goal tends to be in conflict with the goals of complete semantic
+consistency, compatibility and modularity.
+Sun has chosen performance over modularity in some areas,
+but has emphasized clean separation of the layers within the filesystem
+at the expense of performance.
+Although the performance of RFS is yet to be seen,
+AT&T seems to have considered compatibility far more important than modularity
+or performance.
+.SH
+Differences among filesystem interfaces
+.PP
+The existing filesystem interfaces may be characterized
+in several ways.
+Each system is centered around a few data structures or objects,
+along with a set of primitives for performing operations upon these objects.
+In the original
+.UX
+filesystem [Ritchie74],
+the basic object used by the filesystem is the inode, or index node.
+The inode contains all of the information about a file except its name:
+its type, identification, ownership, permissions, timestamps and location.
+Inodes are identified by the filesystem device number and the index within
+the filesystem.
+The major entry points to the filesystem are \fInamei\fP,
+which translates a filesystem pathname into the underlying inode,
+and \fIiget\fP, which locates an inode by number and installs it in the in-core
+inode table.
+\fINamei\fP performs name translation by iterative lookup
+of each component name in its directory to find its inumber,
+then using \fIiget\fP to return the actual inode.
+If the last component has been reached, this inode is returned;
+otherwise, the inode describes the next directory to be searched.
+The inode returned may be used in various ways by the caller;
+it may be examined, the file may be read or written,
+types and access may be checked, and fields may be modified.
+Modified inodes are automatically written back to the filesystem
+on disk when the last reference is released with \fIiput\fP.
+Although the details are considerably different,
+the same general scheme is used in the fast filesystem in 4.2BSD
+.UX
+[McKusick85].
+.PP
+Both the AT&T interface and, to a lesser extent, the DEC interface
+attempt to preserve the inode-oriented interface.
+Each modifies the inode to allow different varieties of the structure
+for different filesystem types by separating the filesystem-dependent
+parts of the inode into a separate structure or one arm of a union.
+Both interfaces allow operations
+equivalent to the \fInamei\fP and \fIiget\fP operations
+of the old filesystem to be performed in the filesystem-independent
+layer, with entry points to the individual filesystem implementations to support
+the type-specific parts of these operations. Implicit in this interface
+is that files may conveniently be named by and located using a single
+index within a filesystem.
+The GFS provides specific entry points to the filesystems
+to change most file properties rather than allowing arbitrary changes
+to be made to the generic part of the inode.
+.PP
+In contrast, the Sun VFS interface replaces the inode as the primary object
+with the vnode.
+The vnode contains no filesystem-dependent fields except the pointer
+to the set of operations implemented by the filesystem.
+Properties of a vnode that might be transient, such as the ownership,
+permissions, size and timestamps, are maintained by the lower layer.
+These properties may be presented in a generic format upon request;
+callers are expected not to hold this information for any length of time,
+as they may not be up-to-date later on.
+The vnode operations do not include a corollary for \fIiget\fP;
+the only external interface for obtaining vnodes for specific files
+is the name lookup operation.
+(Separate procedures are provided outside of this interface
+that obtain a ``file handle'' for a vnode which may be given
+to a client by a server, such that the vnode may be retrieved
+upon later presentation of the file handle.)
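+.PP
+The essence of this organization may be sketched as follows;
+the field and operation names here are illustrative, not
+Sun's exact declarations:
+.DS
+/*
+ * Sketch of the vnode idea; names are ours, not Sun's.
+ */
+struct vnode {
+	int	v_count;	/* reference count */
+	struct	vnodeops *v_op;	/* filesystem's operations */
+	caddr_t	v_data;		/* private filesystem state */
+};
+
+struct vnodeops {
+	int	(*vn_lookup)();	/* translate one component */
+	int	(*vn_getattr)();	/* transient properties on request */
+	int	(*vn_rdwr)();	/* read or write via a uio */
+};
+.DE
+The filesystem-independent layer operates on a vnode only by
+calling through \fIv_op\fP, with no knowledge of the
+implementation behind it.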
+.SH
+Name translation issues
+.PP
+Each of the systems described include a mechanism for performing
+pathname-to-internal-representation translation.
+The style of the name translation function is very different in all
+three systems.
+As described above, the AT&T and DEC systems retain the \fInamei\fP function.
+The two are quite different, however, as the ULTRIX interface uses
+the \fInamei\fP calling convention introduced in 4.3BSD.
+The parameters and context for the name lookup operation
+are collected in a \fInameidata\fP structure which is passed to \fInamei\fP
+for operation.
+Intent to create or delete the named file is declared in advance,
+so that the final directory scan in \fInamei\fP may retain information
+such as the offset in the directory at which the modification will be made.
+Filesystems that use such mechanisms to avoid redundant work
+must therefore lock the directory to be modified so that it may not
+be modified by another process before completion.
+In the System V filesystem, as in previous versions of
+.UX ,
+this information is stored in the per-process \fIuser\fP structure
+by \fInamei\fP for use by a low-level routine called after performing
+the actual creation or deletion of the file itself.
+In 4.3BSD and in the GFS interface, these side effects of \fInamei\fP
+are stored in the \fInameidata\fP structure given as argument to \fInamei\fP,
+which is also presented to the routine implementing file creation or deletion.
+.PP
+The ULTRIX \fInamei\fP routine is responsible for the generic
+parts of the name translation process, such as copying the name into
+an internal buffer, validating it, interpolating
+the contents of symbolic links, and indirecting at mount points.
+As in 4.3BSD, the name is copied into the buffer in a single call,
+according to the location of the name.
+After determining the type of the filesystem at the start of translation
+(the current directory or root directory), it calls the filesystem's
+\fInamei\fP entry with the same structure it received from its caller.
+The filesystem-specific routine translates the name, component by component,
+as long as no mount points are reached.
+It may return after any number of components have been processed.
+\fINamei\fP performs any processing at mount points, then calls
+the correct translation routine for the next filesystem.
+Network filesystems may pass the remaining pathname to a server for translation,
+or they may look up the pathname components one at a time.
+The former strategy would be more efficient,
+but the latter scheme allows mount points within a remote filesystem
+without server knowledge of all client mounts.
+.PP
+The AT&T \fInamei\fP interface is presumably the same as that in previous
+.UX
+systems, accepting the name of a routine to fetch pathname characters
+and an operation (one of: lookup, lookup for creation, or lookup for deletion).
+It translates, component by component, as before.
+If it detects that a mount point crosses to a remote filesystem,
+it passes the remainder of the pathname to the remote server.
+A pathname-oriented request other than open may be completed
+within the \fInamei\fP call,
+avoiding return to the (unmodified) system call handler
+that called \fInamei\fP.
+.PP
+In contrast to the first two systems, Sun's VFS interface has replaced
+\fInamei\fP with \fIlookupname\fP.
+This routine simply calls a new pathname-handling module to allocate
+a pathname buffer and copy in the pathname (copying a character per call),
+then calls \fIlookuppn\fP.
+\fILookuppn\fP performs the iteration over the directories leading
+to the destination file; it copies each pathname component to a local buffer,
+then calls the filesystem \fIlookup\fP entry to locate the vnode
+for that file in the current directory.
+Per-filesystem \fIlookup\fP routines may translate only one component
+per call.
+For creation and deletion of new files, the lookup operation is unmodified;
+the lookup of the final component only serves to check for the existence
+of the file.
+The subsequent creation or deletion call, if any, must repeat the final
+name translation and associated directory scan.
+For new file creation in particular, this is rather inefficient,
+as file creation requires two complete scans of the directory.
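+.PP
+The control structure of this iteration may be sketched as
+follows, using the illustrative vnode operations shown earlier;
+\fInextcomponent\fP is a hypothetical helper that copies one
+component and advances the pathname pointer:
+.DS
+/*
+ * Sketch only: look up a pathname one component at a time,
+ * calling the per-filesystem lookup entry for each.
+ */
+int
+lookuppn(pn, dvp, vpp)
+	char *pn;		/* remaining pathname */
+	struct vnode *dvp;	/* current directory */
+	struct vnode **vpp;	/* result */
+{
+	char component[MAXNAMLEN + 1];
+	char *nextcomponent();
+	int error;
+
+	while (*pn) {
+		pn = nextcomponent(pn, component);
+		error = (*dvp->v_op->vn_lookup)(dvp, component, &dvp);
+		if (error)
+			return (error);
+	}
+	*vpp = dvp;
+	return (0);
+}
+.DE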
+.PP
+Several of the important performance improvements in 4.3BSD
+were related to the name translation process [McKusick85][Leffler84].
+The following changes were made:
+.IP 1. 4
+A system-wide cache of recent translations is maintained.
+The cache is separate from the inode cache, so that multiple names
+for a file may be present in the cache.
+The cache does not hold ``hard'' references to the inodes,
+so that the normal reference pattern is not disturbed.
+.IP 2.
+A per-process cache is kept of the directory and offset
+at which the last successful name lookup was done.
+This allows sequential lookups of all the entries in a directory to be done
+in linear time.
+.IP 3.
+The entire pathname is copied into a kernel buffer in a single operation,
+rather than using two subroutine calls per character.
+.IP 4.
+A pool of pathname buffers is held by \fInamei\fP, avoiding allocation
+overhead.
+.LP
+All of these performance improvements from 4.3BSD are well worth using
+within a more generalized filesystem framework.
+The generalization of the structure may otherwise make an already-expensive
+function even more costly.
+Most of these improvements are present in the GFS system, as it derives
+from the beta-test version of 4.3BSD.
+The Sun system uses a name-translation cache generally like that in 4.3BSD.
+The name cache is a filesystem-independent facility provided for the use
+of the filesystem-specific lookup routines.
+The Sun cache, like that first used at Berkeley but unlike that in 4.3,
+holds a ``hard'' reference to the vnode (increments the reference count).
+The ``soft'' reference scheme in 4.3BSD cannot be used with the current
+NFS implementation, as NFS allocates vnodes dynamically and frees them
+when the reference count returns to zero rather than caching them.
+As a result, fewer names may be held in the cache
+than (local filesystem) vnodes, and the cache distorts the normal reference
+patterns otherwise seen by the LRU cache.
+As the name cache references overflow the local filesystem inode table,
+the name cache must be purged to make room in the inode table.
+Also, to determine whether a vnode is in use (for example,
+before mounting upon it), the cache must be flushed to free any
+cache reference.
+These problems should be corrected
+by the use of the soft cache reference scheme.
+.PP
+A final observation on the efficiency of name translation in the current
+Sun VFS architecture is that the number of subroutine calls used
+by a multi-component name lookup is dramatically larger
+than in the other systems.
+The name lookup scheme in GFS suffers from this problem much less,
+at no expense in violation of layering.
+.PP
+A final problem to be considered is synchronization and consistency.
+As the filesystem operations are more stylized and broken into separate
+entry points for parts of operations, it is more difficult to guarantee
+consistency throughout an operation and/or to synchronize with other
+processes using the same filesystem objects.
+The Sun interface suffers most severely from this,
+as it forbids the filesystems from locking objects across calls
+to the filesystem.
+It is possible that a file may be created between the time that a lookup
+is performed and a subsequent creation is requested.
+Perhaps more strangely, after a lookup fails to find the target
+of a creation attempt, the actual creation might find that the target
+now exists and is a symbolic link.
+The call will either fail unexpectedly, as the target is of the wrong type,
+or the generic creation routine will have to note the error
+and restart the operation from the lookup.
+This problem will always exist in a stateless filesystem,
+but the VFS interface forces all filesystems to share the problem.
+This restriction against locking between calls also
+forces duplication of work during file creation and deletion.
+This is considered unacceptable.
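+.PP
+A generic creation routine constrained in this way must be
+prepared to retry, as in this sketch (\fIlookup\fP and
+\fIcreate\fP stand for the generic entries, and are not
+actual VFS names):
+.DS
+/*
+ * Sketch only: file creation without locks held
+ * between calls into the filesystem.
+ */
+for (;;) {
+	error = lookup(dvp, name, &vp);
+	if (error == 0)
+		break;		/* target already exists */
+	error = create(dvp, name, &vp);
+	if (error != EEXIST)
+		break;		/* success, or a real error */
+	/* another process won the race; start over */
+}
+.DE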
+.SH
+Support facilities and other interactions
+.PP
+Several support facilities are used by the current
+.UX
+filesystem and require generalization for use by other filesystem types.
+For filesystem implementations to be portable,
+it is desirable that these modified support facilities
+should also have a uniform interface and
+behave in a consistent manner in target systems.
+A prominent example is the filesystem buffer cache.
+The buffer cache in a standard (System V or 4.3BSD)
+.UX
+system contains physical disk blocks with no reference to the files containing
+them.
+This works well for the local filesystem, but has obvious problems
+for remote filesystems.
+Sun has modified the buffer cache routines to describe buffers by vnode
+rather than by device.
+For remote files, the vnode used is that of the file, and the block
+numbers are virtual data blocks.
+For local filesystems, a vnode for the block device is used for cache reference,
+and the block numbers are filesystem physical blocks.
+Use of per-file cache description does not easily accommodate
+caching of indirect blocks, inode blocks, superblocks or cylinder group blocks.
+However, the vnode describing the block device for the cache
+is one created internally,
+rather than the vnode for the device looked up when mounting,
+and it is located by searching a private list of vnodes
+rather than by holding it in the mount structure.
+Although the Sun modification makes it possible to use the buffer
+cache for data blocks of remote files, a better generalization
+of the buffer cache is needed.
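+.PP
+The change may be summarized by the cache key;
+an illustrative lookup (not Sun's code, and with field names
+chosen for this sketch) becomes:
+.DS
+/*
+ * Sketch only: buffers are named by vnode and logical
+ * block number rather than by device and physical block.
+ * bufhash() is a hypothetical hash-chain lookup.
+ */
+struct buf *
+incore(vp, lblkno)
+	struct vnode *vp;
+	daddr_t lblkno;
+{
+	register struct buf *bp;
+	struct buf *bufhash();
+
+	for (bp = bufhash(vp, lblkno); bp; bp = bp->b_forw)
+		if (bp->b_vp == vp && bp->b_lblkno == lblkno)
+			return (bp);
+	return ((struct buf *)0);
+}
+.DE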
+.PP
+The RFS filesystem used by AT&T does not currently cache data blocks
+on client systems; thus, the buffer cache is probably unmodified.
+The form of the buffer cache in ULTRIX is unknown to us.
+.PP
+Another subsystem that has a large interaction with the filesystem
+is the virtual memory system.
+The virtual memory system must read data from the filesystem
+to satisfy fill-on-demand page faults.
+For efficiency, this read call is arranged to place the data directly
+into the physical pages assigned to the process (a ``raw'' read) to avoid
+copying the data.
+Although the read operation normally bypasses the filesystem buffer cache,
+consistency must be maintained by checking the buffer cache and copying
+or flushing modified data not yet stored on disk.
+The 4.2BSD virtual memory system, like that of Sun and ULTRIX,
+maintains its own cache of reusable text pages.
+This creates additional complications.
+As the virtual memory systems are redesigned, these problems should be
+resolved by reading through the buffer cache, then mapping the cached
+data into the user address space.
+If the buffer cache or the process pages are changed while the other reference
+remains, the data would have to be copied (``copy-on-write'').
+.PP
+In the meantime, the current virtual memory systems must be used
+with the new filesystem framework.
+Both the Sun and AT&T filesystem interfaces
+provide entry points to the filesystem for optimization of the virtual
+memory system by performing logical-to-physical block number translation
+when setting up a fill-on-demand image for a process.
+The VFS provides a vnode operation analogous to the \fIbmap\fP function of the
+.UX
+filesystem.
+Given a vnode and logical block number, it returns a vnode and block number
+which may be read to obtain the data.
+If the filesystem is local, it returns the private vnode for the block device
+and the physical block number.
+As the \fIbmap\fP operations are all performed at one time, during process
+startup, any indirect blocks for the file will remain in the cache
+after they are once read.
+In addition, the interface provides a \fIstrategy\fP entry that may be used
+for ``raw'' reads from a filesystem device,
+used to read data blocks into an address space without copying.
+This entry uses a buffer header (\fIbuf\fP structure)
+to describe the I/O operation
+instead of a \fIuio\fP structure.
+The buffer-style interface is the same as that used by disk drivers internally.
+This difference allows the current \fIuio\fP primitives to be avoided,
+as they copy all data to/from the current user process address space.
+Instead, for local filesystems these operations could be done internally
+with the standard raw disk read routines,
+which use a \fIuio\fP interface.
+When loading from a remote filesystem,
+the data will be received in a network buffer.
+If network buffers are suitably aligned,
+the data may be mapped into the process address space by a page swap
+without copying.
+In either case, it should be possible to use the standard filesystem
+read entry from the virtual memory system.
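+.PP
+The shape of such an entry is simple; this declaration is an
+illustration (assuming a \fIvn_bmap\fP entry in the operations
+vector of the earlier vnode sketch), not the VFS definition itself:
+.DS
+/*
+ * Sketch only: translate a logical block of vp into the
+ * vnode and block number from which the data may be read.
+ * A local filesystem returns its block-device vnode and a
+ * physical block number; a remote one may return vp itself.
+ */
+int
+vn_bmap(vp, lbn, vpp, bnp)
+	struct vnode *vp;
+	daddr_t lbn;
+	struct vnode **vpp;
+	daddr_t *bnp;
+{
+	return ((*vp->v_op->vn_bmap)(vp, lbn, vpp, bnp));
+}
+.DE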
+.PP
+Other issues that must be considered in devising a portable
+filesystem implementation include kernel memory allocation,
+the implicit use of user-structure global context,
+which may create problems with reentrancy,
+the style of the system call interface,
+and the conventions for synchronization
+(sleep/wakeup, handling of interrupted system calls, semaphores).
+.SH
+The Berkeley Proposal
+.PP
+The Sun VFS interface has been the most widely used of the three described here.
+It is also the most general of the three, in that filesystem-specific
+data and operations are best separated from the generic layer.
+Although it has several disadvantages which were described above,
+most of them may be corrected with minor changes to the interface
+(and, in a few areas, philosophical changes).
+The DEC GFS has other advantages, in particular the use of the 4.3BSD
+\fInamei\fP interface and optimizations.
+It allows single or multiple components of a pathname
+to be translated in a single call to the specific filesystem
+and thus accommodates filesystems with either preference.
+The FSS is least well understood, as there is little public information
+about the interface.
+However, the design goals are the least consistent with those of the Berkeley
+research groups.
+Accordingly, a new filesystem interface has been devised to avoid
+some of the problems in the other systems.
+The proposed interface derives directly from Sun's VFS,
+but, like GFS, uses a 4.3BSD-style name lookup interface.
+Additional context information has been moved from the \fIuser\fP structure
+to the \fInameidata\fP structure so that name translation may be independent
+of the global context of a user process.
+This is especially desired in any system where kernel-mode servers
+operate as light-weight or interrupt-level processes,
+or where a server may store or cache context for several clients.
+This calling interface has the additional advantage
+that the call parameters need not all be pushed onto the stack for each call
+through the filesystem interface,
+and they may be accessed using short offsets from a base pointer
+(unlike global variables in the \fIuser\fP structure).
+.PP
+The proposed filesystem interface is described very tersely here.
+For the most part, data structures and procedures are analogous
+to those used by VFS, and only the changes will be treated here.
+See [Kleiman86] for complete descriptions of the vfs and vnode operations
+in Sun's interface.
+.PP
+The central data structure for name translation is the \fInameidata\fP
+structure.
+The same structure is used to pass parameters to \fInamei\fP,
+to pass these same parameters to filesystem-specific lookup routines,
+to communicate completion status from the lookup routines back to \fInamei\fP,
+and to return completion status to the calling routine.
+For creation or deletion requests, the parameters to the filesystem operation
+to complete the request are also passed in this same structure.
+The form of the \fInameidata\fP structure is:
+.br
+.ne 2i
+.ID
+.nf
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Encapsulation of namei parameters.
+ * One of these is located in the u. area to
+ * minimize space allocated on the kernel stack
+ * and to retain per-process context.
+ */
+struct nameidata {
+ /* arguments to namei and related context: */
+ caddr_t ni_dirp; /* pathname pointer */
+ enum uio_seg ni_seg; /* location of pathname */
+ short ni_nameiop; /* see below */
+ struct vnode *ni_cdir; /* current directory */
+ struct vnode *ni_rdir; /* root directory, if not normal root */
+ struct ucred *ni_cred; /* credentials */
+
+ /* shared between namei, lookup routines and commit routines: */
+ caddr_t ni_pnbuf; /* pathname buffer */
+ char *ni_ptr; /* current location in pathname */
+ int ni_pathlen; /* remaining chars in path */
+ short ni_more; /* more left to translate in pathname */
+ short ni_loopcnt; /* count of symlinks encountered */
+
+ /* results: */
+ struct vnode *ni_vp; /* vnode of result */
+ struct vnode *ni_dvp; /* vnode of intermediate directory */
+
+/* BEGIN UFS SPECIFIC */
+ struct diroffcache { /* last successful directory search */
+ struct vnode *nc_prevdir; /* terminal directory */
+ long nc_id; /* directory's unique id */
+ off_t nc_prevoffset; /* where last entry found */
+ } ni_nc;
+/* END UFS SPECIFIC */
+};
+.DE
+.DS
+.ta \w'#define\0\0'u +\w'WANTPARENT\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+/*
+ * namei operations and modifiers
+ */
+#define LOOKUP 0 /* perform name lookup only */
+#define CREATE 1 /* setup for file creation */
+#define DELETE 2 /* setup for file deletion */
+#define WANTPARENT 0x10 /* return parent directory vnode also */
+#define NOCACHE 0x20 /* name must not be left in cache */
+#define FOLLOW 0x40 /* follow symbolic links */
+#define NOFOLLOW 0x0 /* don't follow symbolic links (pseudo) */
+.DE
+As in current systems other than Sun's VFS, \fInamei\fP is called
+with an operation request, one of LOOKUP, CREATE or DELETE.
+For a LOOKUP, the operation is exactly like the lookup in VFS.
+CREATE and DELETE allow the filesystem to ensure consistency
+by locking the parent inode (private to the filesystem),
+and (for the local filesystem) to avoid duplicate directory scans
+by storing the new directory entry and its offset in the directory
+in the \fIndirinfo\fP structure.
+This is intended to be opaque to the filesystem-independent levels.
+Not all lookups for creation or deletion are actually followed
+by the intended operation; permission may be denied, the filesystem
+may be read-only, etc.
+Therefore, an entry point to the filesystem is provided
+to abort a creation or deletion operation
+and allow release of any locked internal data.
+After a \fInamei\fP with a CREATE or DELETE flag, the pathname pointer
+is set to point to the last filename component.
+Filesystems that choose to implement creation or deletion entirely
+within the subsequent call to a create or delete entry
+are thus free to do so.
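+.PP
+For example, a caller implementing file creation might set up
+the request as in this minimal sketch; names such as
+\fIuap->fname\fP and \fIu.u_cred\fP are era-typical
+illustrations rather than part of the proposal:
+.DS
+/*
+ * Sketch only: request a name lookup for creation.
+ */
+struct nameidata nd;
+
+nd.ni_dirp = uap->fname;	/* pathname argument */
+nd.ni_seg = UIO_USERSPACE;	/* pathname is in user space */
+nd.ni_nameiop = CREATE | WANTPARENT;
+nd.ni_cred = u.u_cred;	/* caller's credentials */
+error = namei(&nd);
+/* on success, ni_dvp holds the parent directory */
+.DE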
+.PP
+The \fInameidata\fP is used to store context used during name translation.
+The current and root directories for the translation are stored here.
+For the local filesystem, the per-process directory offset cache
+is also kept here.
+A file server could leave the directory offset cache empty,
+could use a single cache for all clients,
+or could hold caches for several recent clients.
+.PP
+Several other data structures are used in the filesystem operations.
+One is the \fIucred\fP structure which describes a client's credentials
+to the filesystem.
+This is modified slightly from the Sun structure;
+the ``accounting'' group ID has been merged into the groups array.
+The actual number of groups in the array is given explicitly
+to avoid use of a reserved group ID as a terminator.
+Also, typedefs introduced in 4.3BSD for user and group ID's have been used.
+The \fIucred\fP structure is thus:
+.DS
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Credentials.
+ */
+struct ucred {
+ u_short cr_ref; /* reference count */
+ uid_t cr_uid; /* effective user id */
+ short cr_ngroups; /* number of groups */
+ gid_t cr_groups[NGROUPS]; /* groups */
+ /*
+ * The following either should not be here,
+ * or should be treated as opaque.
+ */
+ uid_t cr_ruid; /* real user id */
+ gid_t cr_svgid; /* saved set-group id */
+};
+.DE
+.PP
+A final structure used by the filesystem interface is the \fIuio\fP
+structure mentioned earlier.
+This structure describes the source or destination of an I/O
+operation, with provision for scatter/gather I/O.
+It is used in the read and write entries to the filesystem.
+The \fIuio\fP structure presented here is modified from the one
+used in 4.2BSD to specify the location of each vector of the operation
+(user or kernel space)
+and to allow an alternate function to be used to implement the data movement.
+The alternate function might perform page remapping rather than a copy,
+for example.
+.DS
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Description of an I/O operation which potentially
+ * involves scatter-gather, with individual sections
+ * described by iovec, below. uio_resid is initially
+ * set to the total size of the operation, and is
+ * decremented as the operation proceeds. uio_offset
+ * is incremented by the amount of each operation.
+ * uio_iov is incremented and uio_iovcnt is decremented
+ * after each vector is processed.
+ */
+struct uio {
+ struct iovec *uio_iov;
+ int uio_iovcnt;
+ off_t uio_offset;
+ int uio_resid;
+ enum uio_rw uio_rw;
+};
+
+enum uio_rw { UIO_READ, UIO_WRITE };
+.DE
+.DS
+.ta .5i +\w'caddr_t\0\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+/*
+ * Description of a contiguous section of an I/O operation.
+ * If iov_op is non-null, it is called to implement the copy
+ * operation, possibly by remapping, with the call
+ * (*iov_op)(from, to, count);
+ * where from and to are caddr_t and count is int.
+ * Otherwise, the copy is done in the normal way,
+ * treating base as a user or kernel virtual address
+ * according to iov_segflg.
+ */
+struct iovec {
+ caddr_t iov_base;
+ int iov_len;
+ enum uio_seg iov_segflg;
+ int (*iov_op)();
+};
+.DE
+.DS
+.ta .5i +\w'UIO_USERISPACE\0\0\0\0\0'u
+/*
+ * Segment flag values.
+ */
+enum uio_seg {
+ UIO_USERSPACE, /* from user data space */
+ UIO_SYSSPACE, /* from system space */
+ UIO_USERISPACE /* from user I space */
+};
+.DE
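+.PP
+The bookkeeping rules given in the \fIuio\fP comment can be made
+concrete with a minimal sketch of the data-movement loop for a read;
+the \fImoveone\fP helper, standing in for the usual user/kernel copy,
+is an illustrative assumption:
+.DS
+/*
+ * Sketch: move data from kernel buffer cp into a uio,
+ * one iovec at a time, updating the offset and residual
+ * count and advancing to the next vector as each fills.
+ */
+register struct iovec *iov;
+register int cnt;
+caddr_t cp;     /* kernel data cursor, assumed initialized */
+
+while (uio->uio_resid > 0 && uio->uio_iovcnt > 0) {
+        iov = uio->uio_iov;
+        cnt = MIN(iov->iov_len, uio->uio_resid);
+        if (iov->iov_op != NULL)
+                (*iov->iov_op)(cp, iov->iov_base, cnt);
+        else
+                moveone(cp, iov->iov_base, cnt, iov->iov_segflg);
+        iov->iov_base += cnt;
+        iov->iov_len -= cnt;
+        uio->uio_offset += cnt;
+        uio->uio_resid -= cnt;
+        cp += cnt;
+        if (iov->iov_len == 0) {
+                uio->uio_iov++;
+                uio->uio_iovcnt--;
+        }
+}
+.DE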
+.SH
+File and filesystem operations
+.PP
+With the introduction of the data structures used by the filesystem
+operations, the complete list of filesystem entry points may be listed.
+As noted, they derive mostly from the Sun VFS interface.
+Lines marked with \fB+\fP are additions to the Sun definitions;
+lines marked with \fB!\fP are modified from VFS.
+.PP
+The structure describing the externally-visible features of a mounted
+filesystem, \fIvfs\fP, is:
+.DS
+.ta .5i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+/*
+ * Structure per mounted file system.
+ * Each mounted file system has an array of
+ * operations and an instance record.
+ * The file systems are put on a doubly linked list.
+ */
+struct vfs {
+ struct vfs *vfs_next; /* next vfs in vfs list */
+\fB+\fP struct vfs *vfs_prev; /* prev vfs in vfs list */
+ struct vfsops *vfs_op; /* operations on vfs */
+ struct vnode *vfs_vnodecovered; /* vnode we mounted on */
+ int vfs_flag; /* flags */
+\fB!\fP int vfs_fsize; /* fundamental block size */
+\fB+\fP int vfs_bsize; /* optimal transfer size */
+\fB!\fP uid_t vfs_exroot; /* exported fs uid 0 mapping */
+ short vfs_exflags; /* exported fs flags */
+ caddr_t vfs_data; /* private data */
+};
+.DE
+.DS
+.ta \w'\fB+\fP 'u +\w'#define\0\0'u +\w'VFS_EXPORTED\0\0'u +\w'0x40\0\0\0\0\0'u
+ /*
+ * vfs flags.
+ * VFS_MLOCK lock the vfs so that name lookup cannot proceed past the vfs.
+ * This keeps the subtree stable during mounts and unmounts.
+ */
+ #define VFS_RDONLY 0x01 /* read only vfs */
+\fB+\fP #define VFS_NOEXEC 0x02 /* can't exec from filesystem */
+ #define VFS_MLOCK 0x04 /* lock vfs so that subtree is stable */
+ #define VFS_MWAIT 0x08 /* someone is waiting for lock */
+ #define VFS_NOSUID 0x10 /* don't honor setuid bits on vfs */
+ #define VFS_EXPORTED 0x20 /* file system is exported (NFS) */
+
+ /*
+ * exported vfs flags.
+ */
+ #define EX_RDONLY 0x01 /* exported read only */
+.DE
+.LP
+The operations supported by the filesystem-specific layer
+on an individual filesystem are:
+.DS
+.ta .5i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+/*
+ * Operations supported on virtual file system.
+ */
+struct vfsops {
+\fB!\fP int (*vfs_mount)( /* vfs, path, data, datalen */ );
+\fB!\fP int (*vfs_unmount)( /* vfs, forcibly */ );
+\fB+\fP int (*vfs_mountroot)();
+ int (*vfs_root)( /* vfs, vpp */ );
+\fB!\fP int (*vfs_statfs)( /* vfs, vp, sbp */ );
+\fB!\fP int (*vfs_sync)( /* vfs, waitfor */ );
+\fB+\fP int (*vfs_fhtovp)( /* vfs, fhp, vpp */ );
+\fB+\fP int (*vfs_vptofh)( /* vp, fhp */ );
+};
+.DE
+.LP
+The \fIvfs_statfs\fP entry returns a structure of the form:
+.DS
+.ta .5i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+/*
+ * file system statistics
+ */
+struct statfs {
+\fB!\fP short f_type; /* type of filesystem */
+\fB+\fP short f_flags; /* copy of vfs (mount) flags */
+\fB!\fP long f_fsize; /* fundamental file system block size */
+\fB+\fP long f_bsize; /* optimal transfer block size */
+ long f_blocks; /* total data blocks in file system */
+ long f_bfree; /* free blocks in fs */
+ long f_bavail; /* free blocks avail to non-superuser */
+ long f_files; /* total file nodes in file system */
+ long f_ffree; /* free file nodes in fs */
+ fsid_t f_fsid; /* file system id */
+\fB+\fP char *f_mntonname; /* directory on which mounted */
+\fB+\fP char *f_mntfromname; /* mounted filesystem */
+ long f_spare[7]; /* spare for later */
+};
+
+typedef long fsid_t[2]; /* file system id type */
+.DE
+.LP
+The modifications to Sun's interface at this level are minor.
+Additional arguments are present for the \fIvfs_mount\fP and \fIvfs_unmount\fP
+entries.
+\fIvfs_statfs\fP accepts a vnode as well as filesystem identifier,
+as the information may not be uniform throughout a filesystem.
+For example,
+if a client mounts a file tree that spans multiple physical
+filesystems on a server, different sections may have different amounts
+of free space.
+(NFS does not allow remotely-mounted file trees to span physical filesystems
+on the server.)
+The final additions are the entries that support file handles.
+\fIvfs_vptofh\fP is provided for the use of file servers,
+which need to obtain an opaque
+file handle to represent the current vnode for transmission to clients.
+This file handle may later be used to relocate the vnode using \fIvfs_fhtovp\fP
+without requiring the vnode to remain in memory.
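+As a minimal sketch of the server-side usage (the opaque
+\fIfhandle_t\fP type and the framing of requests are illustrative
+assumptions):
+.DS
+/*
+ * Sketch: server side of the file handle entries.
+ */
+fhandle_t fh;
+
+error = (*vfsp->vfs_op->vfs_vptofh)(vp, &fh);
+/* ... transmit fh to the client ... */
+
+/* later, on receipt of a request naming fh: */
+error = (*vfsp->vfs_op->vfs_fhtovp)(vfsp, &fh, &vp);
+/* a failure here indicates a stale handle */
+.DE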
+.PP
+Finally, the external form of a filesystem object, the \fIvnode\fP, is:
+.DS
+.ta .5i +\w'struct vnodeops\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+/*
+ * vnode types. VNON means no type.
+ */
+enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK };
+
+struct vnode {
+ u_short v_flag; /* vnode flags (see below) */
+ u_short v_count; /* reference count */
+ u_short v_shlockc; /* count of shared locks */
+ u_short v_exlockc; /* count of exclusive locks */
+ struct vfs *v_vfsmountedhere; /* ptr to vfs mounted here */
+ struct vfs *v_vfsp; /* ptr to vfs we are in */
+ struct vnodeops *v_op; /* vnode operations */
+\fB+\fP struct text *v_text; /* text/mapped region */
+ enum vtype v_type; /* vnode type */
+ caddr_t v_data; /* private data for fs */
+};
+.DE
+.DS
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+/*
+ * vnode flags.
+ */
+#define VROOT 0x01 /* root of its file system */
+#define VTEXT 0x02 /* vnode is a pure text prototype */
+#define VEXLOCK 0x10 /* exclusive lock */
+#define VSHLOCK 0x20 /* shared lock */
+#define VLWAIT 0x40 /* proc is waiting on shared or excl. lock */
+.DE
+.LP
+The operations supported by the filesystems on individual \fIvnode\fP\^s
+are:
+.DS
+.ta .5i +\w'int\0\0\0\0\0'u +\w'(*vn_getattr)(\0\0\0\0\0'u
+/*
+ * Operations on vnodes.
+ */
+struct vnodeops {
+\fB!\fP int (*vn_lookup)( /* ndp */ );
+\fB!\fP int (*vn_create)( /* ndp, vap, fflags */ );
+\fB+\fP int (*vn_mknod)( /* ndp, vap, fflags */ );
+\fB!\fP int (*vn_open)( /* vp, fflags, cred */ );
+ int (*vn_close)( /* vp, fflags, cred */ );
+ int (*vn_access)( /* vp, fflags, cred */ );
+ int (*vn_getattr)( /* vp, vap, cred */ );
+ int (*vn_setattr)( /* vp, vap, cred */ );
+
+\fB+\fP int (*vn_read)( /* vp, uiop, offp, ioflag, cred */ );
+\fB+\fP int (*vn_write)( /* vp, uiop, offp, ioflag, cred */ );
+\fB!\fP int (*vn_ioctl)( /* vp, com, data, fflag, cred */ );
+ int (*vn_select)( /* vp, which, cred */ );
+\fB+\fP int (*vn_mmap)( /* vp, ..., cred */ );
+ int (*vn_fsync)( /* vp, cred */ );
+\fB+\fP int (*vn_seek)( /* vp, offp, off, whence */ );
+
+\fB!\fP int (*vn_remove)( /* ndp */ );
+\fB!\fP int (*vn_link)( /* vp, ndp */ );
+\fB!\fP int (*vn_rename)( /* src ndp, target ndp */ );
+\fB!\fP int (*vn_mkdir)( /* ndp, vap */ );
+\fB!\fP int (*vn_rmdir)( /* ndp */ );
+\fB!\fP int (*vn_symlink)( /* ndp, vap, nm */ );
+ int (*vn_readdir)( /* vp, uiop, offp, ioflag, cred */ );
+ int (*vn_readlink)( /* vp, uiop, ioflag, cred */ );
+
+\fB+\fP int (*vn_abortop)( /* ndp */ );
+\fB+\fP int (*vn_lock)( /* vp */ );
+\fB+\fP int (*vn_unlock)( /* vp */ );
+\fB!\fP int (*vn_inactive)( /* vp */ );
+};
+.DE
+.DS
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0'u
+/*
+ * flags for ioflag
+ */
+#define IO_UNIT 0x01 /* do io as atomic unit for VOP_RDWR */
+#define IO_APPEND 0x02 /* append write for VOP_RDWR */
+#define IO_SYNC 0x04 /* sync io for VOP_RDWR */
+.DE
+.LP
+The argument types listed in the comments following each operation are:
+.sp
+.IP ndp 10
+A pointer to a \fInameidata\fP structure.
+.IP vap
+A pointer to a \fIvattr\fP structure (vnode attributes; see below).
+.IP fflags
+File open flags, possibly including O_APPEND, O_CREAT, O_TRUNC and O_EXCL.
+.IP vp
+A pointer to a \fIvnode\fP previously obtained with \fIvn_lookup\fP.
+.IP cred
+A pointer to a \fIucred\fP credentials structure.
+.IP uiop
+A pointer to a \fIuio\fP structure.
+.IP ioflag
+Any of the IO flags defined above.
+.IP com
+An \fIioctl\fP command, with type \fIunsigned long\fP.
+.IP data
+A pointer to a character buffer used to pass data to or from an \fIioctl\fP.
+.IP which
+One of FREAD, FWRITE or 0 (select for exceptional conditions).
+.IP off
+A file offset of type \fIoff_t\fP.
+.IP offp
+A pointer to file offset of type \fIoff_t\fP.
+.IP whence
+One of L_SET, L_INCR, or L_XTND.
+.IP fhp
+A pointer to a file handle buffer.
+.sp
+.PP
+Several changes have been made to Sun's set of vnode operations.
+Most obviously, the \fIvn_lookup\fP entry receives a \fInameidata\fP structure
+containing its arguments and context as described.
+The same structure is also passed to one of the creation or deletion
+entries if the lookup operation is for CREATE or DELETE to complete
+an operation, or to the \fIvn_abortop\fP entry if no operation
+is undertaken.
+For filesystems that perform no locking between lookup for creation
+or deletion and the call to implement that action,
+the final pathname component may be left untranslated by the lookup
+routine.
+In any case, the pathname pointer points at the final name component,
+and the \fInameidata\fP contains a reference to the vnode of the parent
+directory.
+The interface is thus flexible enough to accommodate filesystems
+that are fully stateful or fully stateless, while avoiding redundant
+operations whenever possible.
+One operation remains problematic: the \fIvn_rename\fP call.
+It is tempting to look up the source of the rename for deletion
+and the target for creation.
+However, filesystems that lock directories during such lookups must avoid
+deadlock if the two paths cross.
+For that reason, the source is translated for LOOKUP only,
+with the WANTPARENT flag set;
+the target is then translated with an operation of CREATE.
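+A minimal sketch of this sequence (with error handling abbreviated
+and the release of source references elided):
+.DS
+/*
+ * Sketch: the two translations that precede vn_rename.
+ */
+sndp->ni_nameiop = LOOKUP | WANTPARENT;
+if (error = namei(sndp))
+        return (error);
+tndp->ni_nameiop = CREATE;
+if (error = namei(tndp))
+        return (error);
+error = (*sndp->ni_dvp->v_op->vn_rename)(sndp, tndp);
+.DE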
+.PP
+In addition to the changes concerned with the \fInameidata\fP interface,
+several other changes were made in the vnode operations.
+The \fIvn_rdwr\fP entry was split into \fIvn_read\fP and \fIvn_write\fP;
+frequently, the read/write entry amounts to a routine that checks
+the direction flag, then calls either a read routine or a write routine.
+The two entries may be identical for any given filesystem;
+the direction flag is contained in the \fIuio\fP given as an argument.
+.PP
+All of the read and write operations use a \fIuio\fP to describe
+the file offset and buffer locations.
+All of these fields must be updated before return.
+In particular, the \fIvn_readdir\fP entry uses this
+to return a new file offset token for its current location.
+.PP
+Several new operations have been added.
+The first, \fIvn_seek\fP, is a concession to record-oriented files
+such as directories.
+It allows the filesystem to verify that a seek leaves a file at a sensible
+offset, or to return a new offset token relative to an earlier one.
+For most filesystems and files, this operation amounts to performing
+simple arithmetic.
+Another new entry point is \fIvn_mmap\fP, for use in mapping device memory
+into a user process address space.
+Its semantics are not yet decided.
+The final additions are the \fIvn_lock\fP and \fIvn_unlock\fP entries.
+These are used to request that the underlying file be locked against
+changes for short periods of time if the filesystem implementation allows it.
+They are used to maintain consistency
+during internal operations such as \fIexec\fP,
+and may not be used to construct atomic operations from other filesystem
+operations.
+.PP
+The attributes of a vnode are not stored in the vnode,
+as they might change with time and may need to be read from a remote
+source.
+Attributes have the form:
+.DS
+.ta .5i +\w'struct vnodeops\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+/*
+ * Vnode attributes. A field value of -1
+ * represents a field whose value is unavailable
+ * (getattr) or which is not to be changed (setattr).
+ */
+struct vattr {
+ enum vtype va_type; /* vnode type (for create) */
+ u_short va_mode; /* files access mode and type */
+\fB!\fP uid_t va_uid; /* owner user id */
+\fB!\fP gid_t va_gid; /* owner group id */
+ long va_fsid; /* file system id (dev for now) */
+\fB!\fP long va_fileid; /* file id */
+ short va_nlink; /* number of references to file */
+ u_long va_size; /* file size in bytes (quad?) */
+\fB+\fP u_long va_size1; /* reserved if not quad */
+ long va_blocksize; /* blocksize preferred for i/o */
+ struct timeval va_atime; /* time of last access */
+ struct timeval va_mtime; /* time of last modification */
+ struct timeval va_ctime; /* time file changed */
+ dev_t va_rdev; /* device the file represents */
+ u_long va_bytes; /* bytes of disk space held by file */
+\fB+\fP u_long va_bytes1; /* reserved if va_bytes not a quad */
+};
+.DE
+.SH
+Conclusions
+.PP
+The Sun VFS interface is the most widely used generic
+filesystem interface.
+Of the interfaces examined, it creates the cleanest separation
+between the filesystem-independent and -dependent layers and data structures.
+It has several flaws, but we believe that certain changes in the interface
+can ameliorate most of them.
+The interface proposed here includes those changes.
+The proposed interface is now being implemented by the Computer Systems
+Research Group at Berkeley.
+If the design succeeds in improving the flexibility and performance
+of the filesystem layering, it will be advanced as a model interface.
+.SH
+Acknowledgements
+.PP
+The filesystem interface described here is derived from Sun's VFS interface.
+It also includes features similar to those of DEC's GFS interface.
+We are indebted to members of the Sun and DEC system groups
+for long discussions of the issues involved.
+.br
+.ne 2i
+.SH
+References
+
+.IP Brownbridge82 \w'Satyanarayanan85\0\0'u
+Brownbridge, D.R., L.F. Marshall, B. Randell,
+``The Newcastle Connection, or UNIXes of the World Unite!,''
+\fISoftware\- Practice and Experience\fP, Vol. 12, pp. 1147-1162, 1982.
+
+.IP Cole85
+Cole, C.T., P.B. Flinn, A.B. Atlas,
+``An Implementation of an Extended File System for UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 131-150, June, 1985.
+
+.IP Kleiman86
+Kleiman, S.R.,
+``Vnodes: An Architecture for Multiple File System Types in Sun UNIX,''
+\fIUsenix Conference Proceedings\fP,
+pp. 238-247, June, 1986.
+
+.IP Leffler84
+Leffler, S., M.K. McKusick, M. Karels,
+``Measuring and Improving the Performance of 4.2BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 237-252, June, 1984.
+
+.IP McKusick84
+McKusick, M.K., W.N. Joy, S.J. Leffler, R.S. Fabry,
+``A Fast File System for UNIX,'' \fITransactions on Computer Systems\fP,
+Vol. 2, pp. 181-197,
+ACM, August, 1984.
+
+.IP McKusick85
+McKusick, M.K., M. Karels, S. Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD,''
+\fIUsenix Conference Proceedings\fP, pp. 519-531, June, 1985.
+
+.IP Rifkin86
+Rifkin, A.P., M.P. Forbes, R.L. Hamilton, M. Sabrio, S. Shah, and K. Yueh,
+``RFS Architectural Overview,'' \fIUsenix Conference Proceedings\fP,
+pp. 248-259, June, 1986.
+
+.IP Ritchie74
+Ritchie, D.M. and K. Thompson, ``The Unix Time-Sharing System,''
+\fICommunications of the ACM\fP, Vol. 17, pp. 365-375, July, 1974.
+
+.IP Rodriguez86
+Rodriguez, R., M. Koehler, R. Hyde,
+``The Generic File System,'' \fIUsenix Conference Proceedings\fP,
+pp. 260-269, June, 1986.
+
+.IP Sandberg85
+Sandberg, R., D. Goldberg, S. Kleiman, D. Walsh, B. Lyon,
+``Design and Implementation of the Sun Network Filesystem,''
+\fIUsenix Conference Proceedings\fP,
+pp. 119-130, June, 1985.
+
+.IP Satyanarayanan85
+Satyanarayanan, M., \fIet al.\fP,
+``The ITC Distributed File System: Principles and Design,''
+\fIProc. 10th Symposium on Operating Systems Principles\fP, pp. 35-50,
+ACM, December, 1985.
+
+.IP Walker85
+Walker, B.J. and S.H. Kiser, ``The LOCUS Distributed Filesystem,''
+\fIThe LOCUS Distributed System Architecture\fP,
+G.J. Popek and B.J. Walker, ed., The MIT Press, Cambridge, MA, 1985.
+
+.IP Weinberger84
+Weinberger, P.J., ``The Version 8 Network File System,''
+\fIUsenix Conference presentation\fP,
+June, 1984.
diff --git a/share/doc/papers/fsinterface/slides.t b/share/doc/papers/fsinterface/slides.t
new file mode 100644
index 000000000000..3caaafbeea59
--- /dev/null
+++ b/share/doc/papers/fsinterface/slides.t
@@ -0,0 +1,318 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)slides.t 5.2 (Berkeley) 4/16/91
+.\"
+.so macros
+.nf
+.LL
+Encapsulation of namei parameters
+.NP 0
+.ta .5i +\w'caddr_t\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct nameidata {
+ /* arguments and context: */
+ caddr_t ni_dirp;
+ enum uio_seg ni_seg;
+ short ni_nameiop;
+ struct vnode *ni_cdir;
+ struct vnode *ni_rdir;
+ struct ucred *ni_cred;
+.sp .2
+ /* shared with lookup and commit: */
+ caddr_t ni_pnbuf;
+ char *ni_ptr;
+ int ni_pathlen;
+ short ni_more;
+ short ni_loopcnt;
+.sp .2
+ /* results: */
+ struct vnode *ni_vp;
+ struct vnode *ni_dvp;
+.sp .2
+/* BEGIN UFS SPECIFIC */
+ struct diroffcache {
+ struct vnode *nc_prevdir;
+ long nc_id;
+ off_t nc_prevoffset;
+ } ni_nc;
+/* END UFS SPECIFIC */
+};
+.bp
+
+
+.LL
+Namei operations and modifiers
+
+.NP 0
+.ta \w'#define\0\0'u +\w'WANTPARENT\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+#define LOOKUP 0 /* name lookup only */
+#define CREATE 1 /* setup for creation */
+#define DELETE 2 /* setup for deletion */
+#define WANTPARENT 0x10 /* return parent vnode also */
+#define NOCACHE 0x20 /* remove name from cache */
+#define FOLLOW 0x40 /* follow symbolic links */
+.bp
+
+.LL
+Namei operations and modifiers
+
+.NP 0
+.ta \w'#define\0\0'u +\w'WANTPARENT\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+#define LOOKUP 0
+#define CREATE 1
+#define DELETE 2
+#define WANTPARENT 0x10
+#define NOCACHE 0x20
+#define FOLLOW 0x40
+.bp
+
+
+.LL
+Credentials
+
+.NP 0
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct ucred {
+ u_short cr_ref;
+ uid_t cr_uid;
+ short cr_ngroups;
+ gid_t cr_groups[NGROUPS];
+ /*
+ * The following either should not be here,
+ * or should be treated as opaque.
+ */
+ uid_t cr_ruid;
+ gid_t cr_svgid;
+};
+.bp
+.LL
+Scatter-gather I/O
+.NP 0
+.ta .5i +\w'caddr_t\0\0\0'u +\w'struct\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct uio {
+ struct iovec *uio_iov;
+ int uio_iovcnt;
+ off_t uio_offset;
+ int uio_resid;
+ enum uio_rw uio_rw;
+};
+
+enum uio_rw { UIO_READ, UIO_WRITE };
+
+
+
+.ta .5i +\w'caddr_t\0\0\0'u +\w'vnode *nc_prevdir;\0\0\0\0\0'u
+struct iovec {
+ caddr_t iov_base;
+ int iov_len;
+ enum uio_seg iov_segflg;
+ int (*iov_op)();
+};
+.bp
+.LL
+Per-filesystem information
+.NP 0
+.ta .25i +\w'struct vfsops\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+struct vfs {
+ struct vfs *vfs_next;
+\fB+\fP struct vfs *vfs_prev;
+ struct vfsops *vfs_op;
+ struct vnode *vfs_vnodecovered;
+ int vfs_flag;
+\fB!\fP int vfs_fsize;
+\fB+\fP int vfs_bsize;
+\fB!\fP uid_t vfs_exroot;
+ short vfs_exflags;
+ caddr_t vfs_data;
+};
+
+.NP 0
+.ta \w'\fB+\fP 'u +\w'#define\0\0'u +\w'VFS_EXPORTED\0\0'u +\w'0x40\0\0\0\0\0'u
+ /* vfs flags: */
+ #define VFS_RDONLY 0x01
+\fB+\fP #define VFS_NOEXEC 0x02
+ #define VFS_MLOCK 0x04
+ #define VFS_MWAIT 0x08
+ #define VFS_NOSUID 0x10
+ #define VFS_EXPORTED 0x20
+
+ /* exported vfs flags: */
+ #define EX_RDONLY 0x01
+.bp
+
+
+.LL
+Operations supported on virtual file system.
+
+.NP 0
+.ta .25i +\w'int\0\0'u +\w'*vfs_mountroot();\0'u
+struct vfsops {
+\fB!\fP int (*vfs_mount)(vfs, path, data, len);
+\fB!\fP int (*vfs_unmount)(vfs, forcibly);
+\fB+\fP int (*vfs_mountroot)();
+ int (*vfs_root)(vfs, vpp);
+ int (*vfs_statfs)(vfs, sbp);
+\fB!\fP int (*vfs_sync)(vfs, waitfor);
+\fB+\fP int (*vfs_fhtovp)(vfs, fhp, vpp);
+\fB+\fP int (*vfs_vptofh)(vp, fhp);
+};
+.bp
+
+
+.LL
+Dynamic file system information
+
+.NP 0
+.ta .5i +\w'struct\0\0\0'u +\w'*vfs_vnodecovered;\0\0\0\0\0'u
+struct statfs {
+\fB!\fP short f_type;
+\fB+\fP short f_flags;
+\fB!\fP long f_fsize;
+\fB+\fP long f_bsize;
+ long f_blocks;
+ long f_bfree;
+ long f_bavail;
+ long f_files;
+ long f_ffree;
+ fsid_t f_fsid;
+\fB+\fP char *f_mntonname;
+\fB+\fP char *f_mntfromname;
+ long f_spare[7];
+};
+
+typedef long fsid_t[2];
+.bp
+.LL
+Filesystem objects (vnodes)
+.NP 0
+.ta .25i +\w'struct vnodeops\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK };
+
+struct vnode {
+ u_short v_flag;
+ u_short v_count;
+ u_short v_shlockc;
+ u_short v_exlockc;
+ struct vfs *v_vfsmountedhere;
+ struct vfs *v_vfsp;
+ struct vnodeops *v_op;
+\fB+\fP struct text *v_text;
+ enum vtype v_type;
+ caddr_t v_data;
+};
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0\0\0'u
+
+/* vnode flags */
+#define VROOT 0x01
+#define VTEXT 0x02
+#define VEXLOCK 0x10
+#define VSHLOCK 0x20
+#define VLWAIT 0x40
+.bp
+.LL
+Operations on vnodes
+
+.NP 0
+.ta .25i +\w'int\0\0'u +\w'(*vn_getattr)(\0\0\0\0\0'u
+struct vnodeops {
+\fB!\fP int (*vn_lookup)(ndp);
+\fB!\fP int (*vn_create)(ndp, vap, fflags);
+\fB+\fP int (*vn_mknod)(ndp, vap, fflags);
+\fB!\fP int (*vn_open)(vp, fflags, cred);
+ int (*vn_close)(vp, fflags, cred);
+ int (*vn_access)(vp, fflags, cred);
+ int (*vn_getattr)(vp, vap, cred);
+ int (*vn_setattr)(vp, vap, cred);
+.sp .5
+\fB+\fP int (*vn_read)(vp, uiop,
+ offp, ioflag, cred);
+\fB+\fP int (*vn_write)(vp, uiop,
+ offp, ioflag, cred);
+\fB!\fP int (*vn_ioctl)(vp, com,
+ data, fflag, cred);
+ int (*vn_select)(vp, which, cred);
+\fB+\fP int (*vn_mmap)(vp, ..., cred);
+ int (*vn_fsync)(vp, cred);
+\fB+\fP int (*vn_seek)(vp, offp, off,
+ whence);
+.bp
+.LL
+Operations on vnodes (cont)
+
+.NP 0
+.ta .25i +\w'int\0\0'u +\w'(*vn_getattr)(\0\0\0\0\0'u
+
+\fB!\fP int (*vn_remove)(ndp);
+\fB!\fP int (*vn_link)(vp, ndp);
+\fB!\fP int (*vn_rename)(sndp, tndp);
+\fB!\fP int (*vn_mkdir)(ndp, vap);
+\fB!\fP int (*vn_rmdir)(ndp);
+\fB!\fP int (*vn_symlink)(ndp, vap, nm);
+\fB!\fP int (*vn_readdir)(vp, uiop,
+ offp, ioflag, cred);
+\fB!\fP int (*vn_readlink)(vp, uiop,
+ offp, ioflag, cred);
+.sp .5
+\fB+\fP int (*vn_abortop)(ndp);
+\fB!\fP int (*vn_inactive)(vp);
+};
+
+.NP 0
+.ta \w'#define\0\0'u +\w'NOFOLLOW\0\0'u +\w'0x40\0\0\0\0\0'u
+/* flags for ioflag */
+#define IO_UNIT 0x01
+#define IO_APPEND 0x02
+#define IO_SYNC 0x04
+.bp
+
+.LL
+Vnode attributes
+
+.NP 0
+.ta .5i +\w'struct timeval\0\0'u +\w'*v_vfsmountedhere;\0\0\0'u
+struct vattr {
+ enum vtype va_type;
+ u_short va_mode;
+\fB!\fP uid_t va_uid;
+\fB!\fP gid_t va_gid;
+ long va_fsid;
+\fB!\fP long va_fileid;
+ short va_nlink;
+ u_long va_size;
+\fB+\fP u_long va_size1;
+ long va_blocksize;
+ struct timeval va_atime;
+ struct timeval va_mtime;
+ struct timeval va_ctime;
+ dev_t va_rdev;
+\fB!\fP u_long va_bytes;
+\fB+\fP u_long va_bytes1;
+};
diff --git a/share/doc/papers/kernmalloc/Makefile b/share/doc/papers/kernmalloc/Makefile
new file mode 100644
index 000000000000..8966f36d8b67
--- /dev/null
+++ b/share/doc/papers/kernmalloc/Makefile
@@ -0,0 +1,11 @@
+# @(#)Makefile 1.8 (Berkeley) 6/8/93
+
+DIR= papers/kernmalloc
+SRCS= kernmalloc.t appendix.t
+MACROS= -ms
+
+paper.ps: ${SRCS} alloc.fig usage.tbl
+ ${SOELIM} ${SRCS} | ${TBL} | ${PIC} | ${EQN} | ${GRIND} | \
+ ${ROFF} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/kernmalloc/alloc.fig b/share/doc/papers/kernmalloc/alloc.fig
new file mode 100644
index 000000000000..1ef260b9ac7c
--- /dev/null
+++ b/share/doc/papers/kernmalloc/alloc.fig
@@ -0,0 +1,115 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)alloc.fig 5.1 (Berkeley) 4/16/91
+.\"
+.PS
+scale=100
+define m0 |
+[ box invis ht 16 wid 32 with .sw at 0,0
+line from 4,12 to 4,4
+line from 8,12 to 8,4
+line from 12,12 to 12,4
+line from 16,12 to 16,4
+line from 20,12 to 20,4
+line from 24,12 to 24,4
+line from 28,12 to 28,4
+line from 0,16 to 0,0
+line from 0,8 to 32,8
+] |
+
+define m1 |
+[ box invis ht 16 wid 32 with .sw at 0,0
+line from 8,12 to 8,4
+line from 16,12 to 16,4
+line from 24,12 to 24,4
+line from 0,8 to 32,8
+line from 0,16 to 0,0
+] |
+
+define m2 |
+[ box invis ht 16 wid 32 with .sw at 0,0
+line from 0,8 to 32,8
+line from 0,16 to 0,0
+] |
+
+define m3 |
+[ box invis ht 16 wid 31 with .sw at 0,0
+line from 15,12 to 15,4
+line from 0,8 to 31,8
+line from 0,16 to 0,0
+] |
+
+box invis ht 212 wid 580 with .sw at 0,0
+"\f1\s10\&kernel memory pages\f1\s0" at 168,204
+"\f1\s10\&Legend:\f1\s0" at 36,144
+"\f1\s10\&cont \- continuation of previous page\f1\s0" at 28,112 ljust
+"\f1\s10\&free \- unused page\f1\s0" at 28,128 ljust
+"\f1\s10\&Usage:\f1\s0" at 34,87
+"\f1\s10\&memsize(addr)\f1\s0" at 36,71 ljust
+"\f1\s10\&char *addr;\f1\s0" at 66,56 ljust
+"\f1\s10\&{\f1\s0" at 36,43 ljust
+"\f1\s10\&return(kmemsizes[(addr \- kmembase) \- \s-1PAGESIZE\s+1]);\f1" at 66,29 ljust
+"\f1\s10\&}\f1\s0" at 36,8 ljust
+line from 548,192 to 548,176
+line from 548,184 to 580,184 dotted
+"\f1\s10\&1024,\f1\s0" at 116,168
+"\f1\s10\&256,\f1\s0" at 148,168
+"\f1\s10\&512,\f1\s0" at 180,168
+"\f1\s10\&3072,\f1\s0" at 212,168
+"\f1\s10\&cont,\f1\s0" at 276,168
+"\f1\s10\&cont,\f1\s0" at 244,168
+"\f1\s10\&128,\f1\s0" at 308,168
+"\f1\s10\&128,\f1\s0" at 340,168
+"\f1\s10\&free,\f1\s0" at 372,168
+"\f1\s10\&cont,\f1\s0" at 404,168
+"\f1\s10\&128,\f1\s0" at 436,168
+"\f1\s10\&1024,\f1\s0" at 468,168
+"\f1\s10\&free,\f1\s0" at 500,168
+"\f1\s10\&cont,\f1\s0" at 532,168
+"\f1\s10\&cont,\f1\s0" at 564,168
+m2 with .nw at 100,192
+m1 with .nw at 132,192
+m3 with .nw at 164,192
+m2 with .nw at 196,192
+m2 with .nw at 228,192
+m2 with .nw at 260,192
+m0 with .nw at 292,192
+m0 with .nw at 324,192
+m2 with .nw at 356,192
+m2 with .nw at 388,192
+m0 with .nw at 420,192
+m2 with .nw at 452,192
+m2 with .nw at 484,192
+m2 with .nw at 516,192
+"\f1\s10\&kmemsizes[] = {\f1\s0" at 100,168 rjust
+"\f1\s10\&char *kmembase\f1\s0" at 97,184 rjust
+.PE
diff --git a/share/doc/papers/kernmalloc/appendix.t b/share/doc/papers/kernmalloc/appendix.t
new file mode 100644
index 000000000000..bcd3e8ce7ef7
--- /dev/null
+++ b/share/doc/papers/kernmalloc/appendix.t
@@ -0,0 +1,137 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)appendix.t 5.1 (Berkeley) 4/16/91
+.\"
+.bp
+.H 1 "Appendix A - Implementation Details"
+.LP
+.nf
+.vS
+/*
+ * Constants for setting the parameters of the kernel memory allocator.
+ *
+ * 2 ** MINBUCKET is the smallest unit of memory that will be
+ * allocated. It must be at least large enough to hold a pointer.
+ *
+ * Units of memory less than or equal to MAXALLOCSAVE will permanently
+ * allocate physical memory; requests for pieces of memory of these
+ * sizes are quite fast. Allocations greater than MAXALLOCSAVE must
+ * always allocate and free physical memory; requests for allocations
+ * of these sizes should be made infrequently, as they will be slow.
+ * Constraints: CLBYTES <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14)
+ * and MAXALLOCSAVE must be a power of two.
+ */
+#define MINBUCKET 4 /* 4 => min allocation of 16 bytes */
+#define MAXALLOCSAVE (2 * CLBYTES)
+
+/*
+ * Maximum amount of kernel dynamic memory.
+ * Constraints: must be a multiple of the pagesize.
+ */
+#define MAXKMEM (1024 * PAGESIZE)
+
+/*
+ * Arena for all kernel dynamic memory allocation.
+ * This arena is known to start on a page boundary.
+ */
+extern char kmembase[MAXKMEM];
+
+/*
+ * Array of descriptors that describe the contents of each page
+ */
+struct kmemsizes {
+ short ks_indx; /* bucket index, size of small allocations */
+ u_short ks_pagecnt; /* for large allocations, pages allocated */
+} kmemsizes[MAXKMEM / PAGESIZE];
+
+/*
+ * Set of buckets for each size of memory block that is retained
+ */
+struct kmembuckets {
+ caddr_t kb_next; /* list of free blocks */
+} bucket[MINBUCKET + 16];
+.bp
+/*
+ * Macro to convert a size to a bucket index. If the size is constant,
+ * this macro reduces to a compile time constant.
+ */
+#define MINALLOCSIZE (1 << MINBUCKET)
+#define BUCKETINDX(size) \
+ (size) <= (MINALLOCSIZE * 128) \
+ ? (size) <= (MINALLOCSIZE * 8) \
+ ? (size) <= (MINALLOCSIZE * 2) \
+ ? (size) <= (MINALLOCSIZE * 1) \
+ ? (MINBUCKET + 0) \
+ : (MINBUCKET + 1) \
+ : (size) <= (MINALLOCSIZE * 4) \
+ ? (MINBUCKET + 2) \
+ : (MINBUCKET + 3) \
+	: (size) <= (MINALLOCSIZE * 32) \
+ ? (size) <= (MINALLOCSIZE * 16) \
+ ? (MINBUCKET + 4) \
+ : (MINBUCKET + 5) \
+ : (size) <= (MINALLOCSIZE * 64) \
+ ? (MINBUCKET + 6) \
+ : (MINBUCKET + 7) \
+ : (size) <= (MINALLOCSIZE * 2048) \
+ /* etc ... */
+
+/*
+ * Macro versions for the usual cases of malloc/free
+ */
+#define MALLOC(space, cast, size, flags) { \
+ register struct kmembuckets *kbp = &bucket[BUCKETINDX(size)]; \
+ long s = splimp(); \
+ if (kbp->kb_next == NULL) { \
+ (space) = (cast)malloc(size, flags); \
+ } else { \
+ (space) = (cast)kbp->kb_next; \
+ kbp->kb_next = *(caddr_t *)(space); \
+ } \
+ splx(s); \
+}
+
+#define FREE(addr) { \
+ register struct kmembuckets *kbp; \
+ register struct kmemsizes *ksp = \
+ &kmemsizes[((addr) - kmembase) / PAGESIZE]; \
+ long s = splimp(); \
+ if (1 << ksp->ks_indx > MAXALLOCSAVE) { \
+ free(addr); \
+ } else { \
+ kbp = &bucket[ksp->ks_indx]; \
+ *(caddr_t *)(addr) = kbp->kb_next; \
+ kbp->kb_next = (caddr_t)(addr); \
+ } \
+ splx(s); \
+}
+.vE
diff --git a/share/doc/papers/kernmalloc/kernmalloc.t b/share/doc/papers/kernmalloc/kernmalloc.t
new file mode 100644
index 000000000000..62df1b410657
--- /dev/null
+++ b/share/doc/papers/kernmalloc/kernmalloc.t
@@ -0,0 +1,649 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)kernmalloc.t 5.1 (Berkeley) 4/16/91
+.\"
+.\" reference a system routine name
+.de RN
+\fI\\$1\fP\^(\h'1m/24u')\\$2
+..
+.\" reference a header name
+.de H
+.NH \\$1
+\\$2
+..
+.\" begin figure
+.\" .FI "title"
+.nr Fn 0 1
+.de FI
+.ds Lb Figure \\n+(Fn
+.ds Lt \\$1
+.KF
+.DS B
+.nf
+..
+.\"
+.\" end figure
+.de Fe
+.sp .5
+.\" cheat: original indent is stored in \n(OI by .DS B; restore it
+.\" then center legend after .DE rereads and centers the block.
+\\\\.in \\n(OI
+\\\\.ce
+\\\\*(Lb. \\\\*(Lt
+.sp .5
+.DE
+.KE
+.if \nd 'ls 2
+..
+.EQ
+delim $$
+.EN
+.ds CH "
+.pn 295
+.sp
+.rs
+.ps -1
+.sp -1
+.fi
+Reprinted from:
+\fIProceedings of the San Francisco USENIX Conference\fP,
+pp. 295-303, June 1988.
+.ps
+.\".sp |\n(HMu
+.rm CM
+.nr PO 1.25i
+.TL
+Design of a General Purpose Memory Allocator for the 4.3BSD UNIX\(dg Kernel
+.ds LF Summer USENIX '88
+.ds CF "%
+.ds RF San Francisco, June 20-24
+.EH 'Design of a General Purpose Memory ...''McKusick, Karels'
+.OH 'McKusick, Karels''Design of a General Purpose Memory ...'
+.FS
+\(dgUNIX is a registered trademark of AT&T in the US and other countries.
+.FE
+.AU
+Marshall Kirk McKusick
+.AU
+Michael J. Karels
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+The 4.3BSD UNIX kernel uses many memory allocation mechanisms,
+each designed for the particular needs of the utilizing subsystem.
+This paper describes a general purpose dynamic memory allocator
+that can be used by all of the kernel subsystems.
+The design of this allocator takes advantage of known memory usage
+patterns in the UNIX kernel and a hybrid strategy that is time-efficient
+for small allocations and space-efficient for large allocations.
+This allocator replaces the multiple memory allocation interfaces
+with a single easy-to-program interface,
+results in more efficient use of global memory by eliminating
+partitioned and specialized memory pools,
+and is quick enough that no performance loss is observed
+relative to the current implementations.
+The paper concludes with a discussion of our experience in using
+the new memory allocator,
+and directions for future work.
+.AE
+.LP
+.H 1 "Kernel Memory Allocation in 4.3BSD
+.PP
+The 4.3BSD kernel has at least ten different memory allocators.
+Some of them handle large blocks,
+some of them handle small chained data structures,
+and others include information to describe I/O operations.
+Often the allocations are for small pieces of memory that are only
+needed for the duration of a single system call.
+In a user process such short-term
+memory would be allocated on the run-time stack.
+Because the kernel has a limited run-time stack,
+it is not feasible to allocate even moderate blocks of memory on it.
+Consequently, such memory must be allocated through a more dynamic mechanism.
+For example,
+when the system must translate a pathname,
+it must allocate a one kilobyte buffer to hold the name.
+Other blocks of memory must persist longer than a single system call,
+and thus must be allocated from dynamic memory.
+Examples include protocol control blocks that remain throughout
+the duration of the network connection.
+.PP
+Demands for dynamic memory allocation in the kernel have increased
+as more services have been added.
+Each time a new type of memory allocation has been required,
+a specialized memory allocation scheme has been written to handle it.
+Often the new memory allocation scheme has been built on top
+of an older allocator.
+For example, the block device subsystem provides a crude form of
+memory allocation through the allocation of empty buffers [Thompson78].
+The allocation is slow because of the implied semantics of
+finding the oldest buffer, pushing its contents to disk if they are dirty,
+and moving physical memory into or out of the buffer to create
+the requested size.
+To reduce the overhead, a ``new'' memory allocator was built in 4.3BSD
+for name translation that allocates a pool of empty buffers.
+It keeps them on a free list so they can
+be quickly allocated and freed [McKusick85].
+.PP
+This memory allocation method has several drawbacks.
+First, the new allocator can only handle a limited range of sizes.
+Second, it depletes the buffer pool, diverting memory intended
+for buffering disk blocks to other purposes.
+Finally, it creates yet another interface of
+which the programmer must be aware.
+.PP
+A generalized memory allocator is needed to reduce the complexity
+of writing code inside the kernel.
+Rather than providing many semi-specialized ways of allocating memory,
+the kernel should provide a single general purpose allocator.
+With only a single interface,
+programmers do not need to figure
+out the most appropriate way to allocate memory.
+If a good general purpose allocator is available,
+it helps avoid the syndrome of creating yet another special
+purpose allocator.
+.PP
+To ease the task of understanding how to use it,
+the memory allocator should have an interface similar to the interface
+of the well-known memory allocator provided for
+applications programmers through the C library routines
+.RN malloc
+and
+.RN free .
+Like the C library interface,
+the allocation routine should take a parameter specifying the
+size of memory that is needed.
+The range of sizes for memory requests should not be constrained.
+The free routine should take a pointer to the storage being freed,
+and should not require additional information such as the size
+of the piece of memory being freed.
+.H 1 "Criteria for a Kernel Memory Allocator
+.PP
+The design specification for a kernel memory allocator is similar to,
+but not identical to,
+the design criteria for a user level memory allocator.
+The first criterion for a memory allocator is that it make good use
+of the physical memory.
+Good use of memory is measured by the amount of memory needed to hold
+a set of allocations at any point in time.
+Percentage utilization is expressed as:
+.EQ
+utilization~=~requested over required
+.EN
+Here, ``requested'' is the sum of the memory that has been requested
+and not yet freed.
+``Required'' is the amount of memory that has been
+allocated for the pool from which the requests are filled.
+An allocator requires more memory than requested because of fragmentation
+and a need to have a ready supply of free memory for future requests.
+A perfect memory allocator would have a utilization of 100%.
+In practice,
+having a 50% utilization is considered good [Korn85].
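+For example, an allocator that holds 600 kilobytes of memory,
+of which 300 kilobytes are currently requested,
+has a utilization of 50%.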
+.PP
+Good memory utilization in the kernel is more important than
+in user processes.
+Because user processes run in virtual memory,
+unused parts of their address space can be paged out.
+Thus pages in the process address space
+that are part of the ``required'' pool that are not
+being ``requested'' need not tie up physical memory.
+Because the kernel is not paged,
+all pages in the ``required'' pool are held by the kernel and
+cannot be used for other purposes.
+To keep the kernel utilization percentage as high as possible,
+it is desirable to release unused memory in the ``required'' pool
+rather than to hold it as is typically done with user processes.
+Because the kernel can directly manipulate its own page maps,
+releasing unused memory is fast;
+a user process must do a system call to release memory.
+.PP
+The most important criterion for a memory allocator is that it be fast.
+Because memory allocation is done frequently,
+a slow memory allocator will degrade the system performance.
+Speed of allocation is more critical when executing in the
+kernel than in user code,
+because the kernel must allocate many data structures that user
+processes can allocate cheaply on their run-time stack.
+In addition, the kernel represents the platform on which all user
+processes run,
+and if it is slow, it will degrade the performance of every process
+that is running.
+.PP
+Another problem with a slow memory allocator is that programmers
+of frequently-used kernel interfaces will feel that they
+cannot afford to use it as their primary memory allocator.
+Instead they will build their own memory allocator on top of the
+original by maintaining their own pool of memory blocks.
+Multiple allocators reduce the efficiency with which memory is used.
+The kernel ends up with many different free lists of memory
+instead of a single free list from which all allocation can be drawn.
+For example,
+consider the case of two subsystems that need memory.
+If they have their own free lists,
+the amount of memory tied up in the two lists will be the
+sum of the greatest amount of memory that each of
+the two subsystems has ever used.
+If they share a free list,
+the amount of memory tied up in the free list may be as low as the
+greatest amount of memory that either subsystem used.
+As the number of subsystems grows,
+the savings from having a single free list grow.
+.H 1 "Existing User-level Implementations
+.PP
+There are many different algorithms and
+implementations of user-level memory allocators.
+A survey of those available on UNIX systems appeared in [Korn85].
+Nearly all of the memory allocators tested made good use of memory,
+though most of them were too slow for use in the kernel.
+The fastest memory allocator in the survey by nearly a factor of two
+was the memory allocator provided on 4.2BSD originally
+written by Chris Kingsley at California Institute of Technology.
+Unfortunately,
+the 4.2BSD memory allocator also wasted twice as much memory
+as its nearest competitor in the survey.
+.PP
+The 4.2BSD user-level memory allocator works by maintaining a set of lists
+that are ordered by increasing powers of two.
+Each list contains a set of memory blocks of its corresponding size.
+To fulfill a memory request,
+the size of the request is rounded up to the next power of two.
+A piece of memory is then removed from the list corresponding
+to the specified power of two and returned to the requester.
+Thus, a request for a block of memory of size 53 returns
+a block from the 64-sized list.
+A typical memory allocation requires a roundup calculation
+followed by a linked list removal.
+Only if the list is empty is a real memory allocation done.
+The free operation is also fast;
+the block of memory is put back onto the list from which it came.
+The correct list is identified by a size indicator stored
+immediately preceding the memory block.
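+.PP
+A minimal sketch of this strategy follows;
+the names (NBUCKETS, MINBLOCK, \fImorecore\fP) and the omission of
+error handling are illustrative, not a transcription of the actual
+library source.
+.DS
+/*
+ * Sketch: power-of-two free-list allocation.  The index
+ * stored before each block identifies the list to which
+ * free() later returns it.
+ */
+union overhead {
+        union overhead *ov_next;  /* free: link on free list */
+        short ov_indx;            /* allocated: free-list index */
+};
+union overhead *freelist[NBUCKETS];
+
+char *
+pow2alloc(size)
+        unsigned size;
+{
+        register union overhead *op;
+        register int indx = 0;
+        unsigned amt = MINBLOCK;  /* smallest block, a power of 2 */
+
+        while (size + sizeof (union overhead) > amt) {
+                indx++;
+                amt <<= 1;
+        }
+        if ((op = freelist[indx]) == NULL) {
+                morecore(indx);   /* refill the list from sbrk */
+                op = freelist[indx];
+        }
+        freelist[indx] = op->ov_next;
+        op->ov_indx = indx;
+        return ((char *)(op + 1));
+}
+.DE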
+.H 1 "Considerations Unique to a Kernel Allocator
+.PP
+There are several special conditions that arise when writing a
+memory allocator for the kernel that do not apply to a user process
+memory allocator.
+First, the maximum memory allocation can be determined at
+the time that the machine is booted.
+This number is never more than the amount of physical memory on the machine,
+and is typically much less since a machine with all its
+memory dedicated to the operating system is uninteresting to use.
+Thus, the kernel can statically allocate a set of data structures
+to manage its dynamically allocated memory.
+These data structures never need to be
+expanded to accommodate memory requests;
+yet, if properly designed, they need not be large.
+For a user process, the maximum amount of memory that may be allocated
+is a function of the maximum size of its virtual memory.
+Although it could allocate static data structures to manage
+its entire virtual memory,
+even if they were efficiently encoded they would potentially be huge.
+The other alternative is to allocate data structures as they are needed.
+However, that adds extra complications: new failure modes arise
+if space for additional structures cannot be allocated,
+and additional mechanisms are needed to link them all together.
+.PP
+Another special condition of the kernel memory allocator is that it
+can control its own address space.
+Unlike user processes that can only grow and shrink their heap at one end,
+the kernel can keep an arena of kernel addresses and allocate
+pieces from that arena which it then populates with physical memory.
+The effect is much the same as a user process that has parts of
+its address space paged out when they are not in use,
+except that the kernel can explicitly control the set of pages
+allocated to its address space.
+The result is that the ``working set'' of pages in use by the
+kernel exactly corresponds to the set of pages that it is really using.
+.FI "One day memory usage on a Berkeley time-sharing machine"
+.so usage.tbl
+.Fe
+.PP
+A final special condition that applies to the kernel is that
+all of the different uses of dynamic memory are known in advance.
+Each one of these uses of dynamic memory can be assigned a type.
+For each type of dynamic memory that is allocated,
+the kernel can provide allocation limits.
+One reason given for having separate allocators is that
+no single allocator could starve the rest of the kernel of all
+its available memory and thus a single runaway
+client could not paralyze the system.
+By putting limits on each type of memory,
+the single general purpose memory allocator can provide the same
+protection against memory starvation.\(dg
+.FS
+\(dgOne might reasonably ask how much this protection is worth
+when the ``only'' subsystem to hang is one as critical as the
+network on a diskless workstation.
+.FE
+.PP
+\*(Lb shows the memory usage of the kernel over a one day period
+on a general timesharing machine at Berkeley.
+The ``In Use'', ``Free'', and ``Mem Use'' fields are instantaneous values;
+the ``Requests'' field is the number of allocations since system startup;
+the ``High Use'' field is the maximum value of
+the ``Mem Use'' field since system startup.
+The figure demonstrates that most
+allocations are for small objects.
+Large allocations occur infrequently,
+and are typically for long-lived objects
+such as buffers to hold the superblock for
+a mounted file system.
+Thus, a memory allocator only needs to be
+fast for small pieces of memory.
+.H 1 "Implementation of the Kernel Memory Allocator
+.PP
+In reviewing the available memory allocators,
+none of their strategies could be used without some modification.
+The kernel memory allocator that we ended up with is a hybrid
+of the fast memory allocator found in the 4.2BSD C library
+and a slower but more-memory-efficient first-fit allocator.
+.PP
+Small allocations are done using the 4.2BSD power-of-two list strategy;
+the typical allocation requires only a computation of
+the list to use and the removal of an element if it is available,
+so it is quite fast.
+Macros are provided to avoid the cost of a subroutine call.
+Only if the request cannot be fulfilled from a list is a call
+made to the allocator itself.
+To ensure that the allocator is always called for large requests,
+the lists corresponding to large allocations are always empty.
+Appendix A shows the data structures and implementation of the macros.
+.PP
+Similarly, freeing a block of memory can be done with a macro.
+The macro computes the list on which to place the request
+and puts it there.
+The free routine is called only if the block of memory is
+considered to be a large allocation.
+Including the cost of blocking out interrupts,
+the allocation and freeing macros generate respectively
+only nine and sixteen (simple) VAX instructions.
+.PP
+Because of the inefficiency of power-of-two allocation strategies
+for large allocations,
+a different strategy is used for allocations larger than two kilobytes.
+The selection of two kilobytes is derived from our statistics on
+the utilization of memory within the kernel,
+which showed that 95 to 98% of allocations are of size one kilobyte or less.
+A frequent caller of the memory allocator
+(the name translation function)
+always requests a one kilobyte block.
+Additionally, the allocation method for large blocks is based on allocating
+pieces of memory in multiples of pages.
+Consequently, the actual allocation sizes for requests of size
+$2~times~pagesize$ or less are identical.\(dg
+.FS
+\(dgTo understand why this number is $size 8 {2~times~pagesize}$ one
+observes that the power-of-two algorithm yields sizes of 1, 2, 4, 8, \&...
+pages while the large block algorithm that allocates in multiples
+of pages yields sizes of 1, 2, 3, 4, \&... pages.
+Thus for allocations of sizes between one and two pages
+both algorithms use two pages;
+it is not until allocations of sizes between two and three pages
+that a difference emerges where the power-of-two algorithm will use
+four pages while the large block algorithm will use three pages.
+.FE
+In 4.3BSD on the VAX, the (software) page size is one kilobyte,
+so two kilobytes is the smallest logical cutoff.
+.PP
+Large allocations are first rounded up to be a multiple of the page size.
+The allocator then uses a first-fit algorithm to find space in the
+kernel address arena set aside for dynamic allocations.
+Thus a request for a five kilobyte piece of memory will use exactly
+five pages of memory rather than eight kilobytes as with
+the power-of-two allocation strategy.
+When a large piece of memory is freed,
+the memory pages are returned to the free memory pool,
+and the address space is returned to the kernel address arena
+where it is coalesced with adjacent free pieces.
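+.PP
+In outline, and with illustrative names
+(the \fIfirstfit\fP and \fIallocpages\fP routines are assumptions;
+the \fIkmemsizes\fP array is the page-descriptor array of Appendix A),
+the large-block path behaves as follows:
+.DS
+/*
+ * Sketch: a large allocation rounds to whole pages,
+ * takes address space first-fit from the arena,
+ * and records the page count for the eventual free.
+ */
+npg = (size + PAGESIZE - 1) / PAGESIZE;
+va = firstfit(kmemarena, npg);  /* first-fit address allocation */
+if (va == NULL)
+        return (NULL);
+allocpages(va, npg);            /* back addresses with memory */
+kmemsizes[(va - kmembase) / PAGESIZE].ks_pagecnt = npg;
+return (va);
+.DE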
+.PP
+Another technique to improve both the efficiency of memory utilization
+and the speed of allocation
+is to cluster same-sized small allocations on a page.
+When a list for a power-of-two allocation is empty,
+a new page is allocated and divided into pieces of the needed size.
+This strategy speeds future allocations as several pieces of memory
+become available as a result of the call into the allocator.
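+.PP
+A sketch of this page-carving step
+(\fIkmem_page\fP is a hypothetical page allocator):
+.DS
+va = kmem_page();			/* get a fresh page */
+for (cp = va; cp < va + CLBYTES; cp += size) {
+	*(caddr_t *)cp = kbp->kb_next;	/* thread piece onto free list */
+	kbp->kb_next = cp;
+}
+.DE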
+.PP
+.FI "Calculation of allocation size"
+.so alloc.fig
+.Fe
+Because the size is not specified when a block of memory is freed,
+the allocator must keep track of the sizes of the pieces it has handed out.
+The 4.2BSD user-level allocator stores the size of each block
+in a header just before the allocation.
+However, this strategy doubles the memory requirement for allocations that
+require a power-of-two-sized block.
+Therefore,
+instead of storing the size of each piece of memory with the piece itself,
+the size information is associated with the memory page.
+\*(Lb shows how the kernel determines
+the size of a piece of memory that is being freed,
+by calculating the page in which it resides,
+and looking up the size associated with that page.
+Eliminating the cost of the overhead per piece improved utilization
+far more than expected.
+The reason is that many allocations in the kernel are for blocks of
+memory whose size is exactly a power of two.
+These requests would be nearly doubled if the user-level strategy were used.
+Now they can be accommodated with no wasted memory.
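+.PP
+A sketch of the per-page size information
+(the names parallel those in Appendix A; the exact layout is elided):
+.DS
+short kmemsizes[MAXKMEM / CLBYTES];	/* size of pieces on each page */
+caddr_t kmembase;			/* base of allocation arena */
+
+/* At free time the size is recovered from the address alone: */
+size = kmemsizes[((caddr_t)addr - kmembase) / CLBYTES];
+.DE
+A few bytes of bookkeeping per page is all that is required,
+consistent with the figures reported below.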
+.PP
+The allocator can be called both from the top half of the kernel,
+which is willing to wait for memory to become available,
+and from the interrupt routines in the bottom half of the kernel
+that cannot wait for memory to become available.
+Clients indicate their willingness (and ability) to wait with a flag
+to the allocation routine.
+For clients that are willing to wait,
+the allocator guarantees that their request will succeed.
+Thus, these clients need not check the return value from the allocator.
+If memory is unavailable and the client cannot wait,
+the allocator returns a null pointer.
+These clients must be prepared to cope with this
+(hopefully infrequent) condition
+(usually by giving up and hoping to do better later).
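+.PP
+From a client's perspective the flag works as in this sketch;
+the flag names are illustrative:
+.DS
+/* Top half: willing to wait, so success is guaranteed. */
+bp = (struct buf *)malloc(sizeof(struct buf), M_WAITOK);
+
+/* Interrupt level: may not wait; must handle failure. */
+mp = malloc(size, M_NOWAIT);
+if (mp == NULL)
+	return (ENOBUFS);	/* give up; retry later */
+.DE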
+.H 1 "Results of the Implementation
+.PP
+The new memory allocator was written about a year ago.
+Conversion from the old memory allocators to the new allocator
+has been going on ever since.
+Many of the special purpose allocators have been eliminated.
+This list includes
+.RN calloc ,
+.RN wmemall ,
+and
+.RN zmemall .
+Many of the special purpose memory allocators built on
+top of other allocators have also been eliminated.
+For example, the allocator that was built on top of the buffer pool allocator
+.RN geteblk
+to allocate pathname buffers in
+.RN namei
+has been eliminated.
+Because the typical allocation is so fast,
+we have found that none of the special purpose pools are needed.
+Indeed, the cost of a typical allocation is about the same as the previous cost of
+allocating buffers from the network pool (\fImbuf\fP\^s).
+Consequently applications that used to allocate network
+buffers for their own uses have been switched over to using
+the general purpose allocator without increasing their running time.
+.PP
+Quantifying the performance of the allocator is difficult because
+it is hard to measure the amount of time spent allocating
+and freeing memory in the kernel.
+The usual approach is to compile a kernel for profiling
+and then compare the running time of the routines that
+implemented the old abstraction versus those that implement the new one.
+The old routines are difficult to quantify because
+individual routines were used for more than one purpose.
+For example, the
+.RN geteblk
+routine was used both to allocate one kilobyte memory blocks
+and for its intended purpose of providing buffers to the filesystem.
+Differentiating these uses is often difficult.
+To get a measure of the cost of memory allocation before
+putting in our new allocator,
+we summed up the running time of all the routines whose
+exclusive task was memory allocation.
+To this total we added the fraction
+of the running time of the multi-purpose routines that could
+clearly be identified as memory allocation usage.
+This number showed that approximately three percent of
+the time spent in the kernel could be attributed to memory allocation.
+.PP
+The new allocator is difficult to measure
+because the usual case of the memory allocator is implemented as a macro.
+Thus, its running time is a small fraction of the running time of the
+numerous routines in the kernel that use it.
+To get a bound on the cost,
+we changed the macro always to call the memory allocation routine.
+Running in this mode, the memory allocator accounted for six percent
+of the time spent in the kernel.
+Factoring out the cost of the statistics collection and the
+subroutine call overhead for the cases that could
+normally be handled by the macro,
+we estimate that the allocator would account for
+at most four percent of time in the kernel.
+These measurements show that the new allocator does not introduce
+significant new run-time costs.
+.PP
+The other major success has been in keeping the size information
+on a per-page basis.
+This technique allows the most frequently requested sizes to be
+allocated without waste.
+It also reduces the amount of bookkeeping information associated
+with the allocator to four kilobytes of information
+per megabyte of memory under management (with a one kilobyte page size).
+.H 1 "Future Work
+.PP
+Our next project is to convert many of the static
+kernel tables to be dynamically allocated.
+Static tables include the process table, the file table,
+and the mount table.
+Making these tables dynamic will have two benefits.
+First, it will reduce the amount of memory
+that must be statically allocated at boot time.
+Second, it will eliminate the arbitrary upper limit imposed
+by the current static sizing
+(although a limit will be retained to constrain runaway clients).
+Other researchers have already shown the memory savings
+achieved by this conversion [Rodriguez88].
+.PP
+Under the current implementation,
+memory is never moved from one size list to another.
+With the 4.2BSD memory allocator this causes problems,
+particularly for large allocations where a process may use
+a quarter megabyte piece of memory once,
+which is then never available for any other size request.
+In our hybrid scheme,
+memory can be shuffled between large requests so that large blocks
+of memory are never stranded as they are with the 4.2BSD allocator.
+However, pages allocated to small requests are allocated once
+to a particular size and never changed thereafter.
+If a burst of requests came in for a particular size,
+that size would acquire a large amount of memory
+that would then not be available for other future requests.
+.PP
+In practice, we do not find that the free lists become too large.
+However, we have been investigating ways to handle such problems
+if they occur in the future.
+Our current investigations involve a routine,
+run as part of the idle loop,
+that would sort the elements
+on each of the free lists into order of increasing address.
+Since any given page has only one size of elements allocated from it,
+the effect of the sorting would be to sort the list into distinct pages.
+When all the pieces of a page became free,
+the page itself could be released back to the free pool so that
+it could be allocated to another purpose.
+Although there is no guarantee that all the pieces of a page would ever
+be freed,
+most allocations are short-lived, lasting only for the duration of
+an open file descriptor, an open network connection, or a system call.
+As new allocations would be made from the page sorted to
+the front of the list,
+return of elements from pages at the back would eventually
+allow pages later in the list to be freed.
+.PP
+Two of the traditional UNIX
+memory allocators remain in the current system.
+The terminal subsystem uses \fIclist\fP\^s (character lists).
+That part of the system is expected to undergo major revision within
+the next year or so, and it will probably be changed to use
+\fImbuf\fP\^s as it is merged into the network system.
+The other major allocator that remains is
+.RN getblk ,
+the routine that manages the filesystem buffer pool memory
+and associated control information.
+Only the filesystem uses
+.RN getblk
+in the current system;
+it manages the constant-sized buffer pool.
+We plan to merge the filesystem buffer cache into the virtual memory system's
+page cache in the future.
+This change will allow the size of the buffer pool to be changed
+according to memory load,
+but will require a policy for balancing memory needs
+with filesystem cache performance.
+.H 1 "Acknowledgments
+.PP
+In the spirit of community support,
+we have made various versions of our allocator available to our test sites.
+They have been busily burning it in and giving
+us feedback on their experiences.
+We acknowledge their invaluable input.
+The feedback from the Usenix program committee on the initial draft of
+our paper suggested numerous important improvements.
+.H 1 "References
+.LP
+.IP Korn85 \w'Rodriguez88\0\0'u
+David Korn, Kiem-Phong Vo,
+``In Search of a Better Malloc''
+\fIProceedings of the Portland Usenix Conference\fP,
+pp 489-506, June 1985.
+.IP McKusick85
+M. McKusick, M. Karels, S. Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD''
+\fIProceedings of the Portland Usenix Conference\fP,
+pp 519-531, June 1985.
+.IP Rodriguez88
+Robert Rodriguez, Matt Koehler, Larry Palmer, Ricky Palmer,
+``A Dynamic UNIX Operating System''
+\fIProceedings of the San Francisco Usenix Conference\fP,
+June 1988.
+.IP Thompson78
+Ken Thompson,
+``UNIX Implementation''
+\fIBell System Technical Journal\fP, volume 57, number 6,
+pp 1931-1946, 1978.
diff --git a/share/doc/papers/kernmalloc/spell.ok b/share/doc/papers/kernmalloc/spell.ok
new file mode 100644
index 000000000000..10c3ab7d8ed4
--- /dev/null
+++ b/share/doc/papers/kernmalloc/spell.ok
@@ -0,0 +1,57 @@
+BUCKETINDX
+CLBYTES
+CM
+Karels
+Kiem
+Koehler
+Korn
+Korn85
+MAXALLOCSAVE
+MAXALLOCSIZE
+MAXKMEM
+MINALLOCSIZE
+MINBUCKET
+Matt
+McKusick
+McKusick85
+Mem
+Phong
+Ricky
+Rodriguez88
+S.Leffler
+Thompson78
+ULTRIX
+Usenix
+VAX
+Vo
+arptbl
+caddr
+devbuf
+extern
+fragtbl
+freelist
+geteblk
+indx
+ioctlops
+kb
+kbp
+kmembase
+kmembuckets
+kmemsizes
+ks
+ksp
+mbuf
+mbufs
+namei
+pagecnt
+pathname
+pcb
+pp
+routetbl
+runtime
+splimp
+splx
+superblk
+temp
+wmemall
+zmemall
diff --git a/share/doc/papers/kernmalloc/usage.tbl b/share/doc/papers/kernmalloc/usage.tbl
new file mode 100644
index 000000000000..c5ebdfee0508
--- /dev/null
+++ b/share/doc/papers/kernmalloc/usage.tbl
@@ -0,0 +1,75 @@
+.\" Copyright (c) 1988 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)usage.tbl 5.1 (Berkeley) 4/16/91
+.\"
+.TS
+box;
+c s s s
+c c c c
+n n n n.
+Memory statistics by bucket size
+=
+Size In Use Free Requests
+_
+128 329 39 3129219
+256 0 0 0
+512 4 0 16
+1024 17 5 648771
+2048 13 0 13
+2049\-4096 0 0 157
+4097\-8192 2 0 103
+8193\-16384 0 0 0
+16385\-32768 1 0 1
+.TE
+.DE
+.DS B
+.TS
+box;
+c s s s s
+c c c c c
+c n n n n.
+Memory statistics by type
+=
+Type In Use Mem Use High Use Requests
+_
+mbuf 6 1K 17K 3099066
+devbuf 13 53K 53K 13
+socket 37 5K 6K 1275
+pcb 55 7K 8K 1512
+routetbl 229 29K 29K 2424
+fragtbl 0 0K 1K 404
+zombie 3 1K 1K 24538
+namei 0 0K 5K 648754
+ioctlops 0 0K 1K 12
+superblk 24 34K 34K 24
+temp 0 0K 8K 258
+.TE
diff --git a/share/doc/papers/kerntune/0.t b/share/doc/papers/kerntune/0.t
new file mode 100644
index 000000000000..90fa2bf3a934
--- /dev/null
+++ b/share/doc/papers/kerntune/0.t
@@ -0,0 +1,129 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)0.t 1.2 (Berkeley) 11/8/90
+.\"
+.EQ
+delim $$
+.EN
+.if n .ND
+.TL
+Using gprof to Tune the 4.2BSD Kernel
+.AU
+Marshall Kirk McKusick
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+This paper describes how the \fIgprof\fP profiler
+accounts for the running time of called routines
+in the running time of the routines that call them.
+It then explains how to configure a profiling kernel on
+the 4.2 Berkeley Software Distribution of
+.UX
+for the VAX\(dd
+.FS
+\(dd VAX is a trademark of Digital Equipment Corporation.
+.FE
+and discusses tradeoffs in techniques for collecting
+profile data.
+\fIGprof\fP identifies problems
+that severely affect the overall performance of the kernel.
+Once a potential problem area is identified,
+benchmark programs are devised to highlight the bottleneck.
+These benchmarks verify that the problem exists and provide
+a metric against which to validate proposed solutions.
+Two caches are added to the kernel to alleviate the bottleneck
+and \fIgprof\fP is used to validate their effectiveness.
+.AE
+.LP
+.de PT
+.lt \\n(LLu
+.pc %
+.nr PN \\n%
+.tl '\\*(LH'\\*(CH'\\*(RH'
+.lt \\n(.lu
+..
+.af PN i
+.ds LH 4.2BSD Performance
+.ds RH Contents
+.bp 1
+.if t .ds CF May 21, 1984
+.if t .ds LF
+.if t .ds RF McKusick
+.ce
+.B "TABLE OF CONTENTS"
+.LP
+.sp 1
+.nf
+.B "1. Introduction"
+.LP
+.sp .5v
+.nf
+.B "2. The \fIgprof\fP Profiler"
+\0.1. Data Presentation
+\0.1.1. The Flat Profile
+\0.1.2. The Call Graph Profile
+\0.2. Profiling the Kernel
+.LP
+.sp .5v
+.nf
+.B "3. Using \fIgprof\fP to Improve Performance
+\0.1. Using the Profiler
+\0.2. An Example of Tuning
+.LP
+.sp .5v
+.nf
+.B "4. Conclusions"
+.LP
+.sp .5v
+.nf
+.B Acknowledgements
+.LP
+.sp .5v
+.nf
+.B References
+.af PN 1
+.bp 1
+.de _d
+.if t .ta .6i 2.1i 2.6i
+.\" 2.94 went to 2.6, 3.64 to 3.30
+.if n .ta .84i 2.6i 3.30i
+..
+.de _f
+.if t .ta .5i 1.25i 2.5i
+.\" 3.5i went to 3.8i
+.if n .ta .7i 1.75i 3.8i
+..
diff --git a/share/doc/papers/kerntune/1.t b/share/doc/papers/kerntune/1.t
new file mode 100644
index 000000000000..d78c5685e3ec
--- /dev/null
+++ b/share/doc/papers/kerntune/1.t
@@ -0,0 +1,48 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)1.t 1.2 (Berkeley) 11/8/90
+.\"
+.ds RH Introduction
+.NH 1
+Introduction
+.PP
+The purpose of this paper is to describe the tools and techniques
+that are available for improving the performance of the kernel.
+The primary tool used to measure the kernel is the hierarchical
+profiler \fIgprof\fP.
+The profiler enables the user to measure the cost of
+the abstractions that the kernel provides to the user.
+Once the expensive abstractions are identified,
+optimizations are postulated to help improve their performance.
+These optimizations are each individually
+verified to ensure that they are producing a measurable improvement.
diff --git a/share/doc/papers/kerntune/2.t b/share/doc/papers/kerntune/2.t
new file mode 100644
index 000000000000..2857dc29ad5b
--- /dev/null
+++ b/share/doc/papers/kerntune/2.t
@@ -0,0 +1,234 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)2.t 1.3 (Berkeley) 11/8/90
+.\"
+.ds RH The \fIgprof\fP Profiler
+.NH 1
+The \fIgprof\fP Profiler
+.PP
+The purpose of the \fIgprof\fP profiling tool is to
+help the user evaluate alternative implementations
+of abstractions.
+The \fIgprof\fP design takes advantage of the fact that the kernel,
+though large, is structured and hierarchical.
+We provide a profile in which the execution time
+for a set of routines that implement an
+abstraction is collected and charged
+to that abstraction.
+The profile can be used to compare and assess the costs of
+various implementations [Graham82] [Graham83].
+.NH 2
+Data presentation
+.PP
+The data is presented to the user in two different formats.
+The first presentation simply lists the routines
+without regard to the amount of time their descendants use.
+The second presentation incorporates the call graph of the
+kernel.
+.NH 3
+The Flat Profile
+.PP
+The flat profile consists of a list of all the routines
+that are called during execution of the kernel,
+with the count of the number of times they are called
+and the number of seconds of execution time for which they
+are themselves accountable.
+The routines are listed in decreasing order of execution time.
+A list of the routines that are never called during execution of
+the kernel is also available
+to verify that nothing important is omitted by
+this profiling run.
+The flat profile gives a quick overview of the routines that are used,
+and shows the routines that are themselves responsible
+for large fractions of the execution time.
+In practice,
+this profile usually shows that no single function
+is overwhelmingly responsible for
+the total time of the kernel.
+Notice that for this profile,
+the individual times sum to the total execution time.
+.NH 3
+The Call Graph Profile
+.PP
+Ideally, we would like to print the call graph of the kernel,
+but we are limited by the two-dimensional nature of our output
+devices.
+We cannot assume that a call graph is planar,
+and even if it is, that we can print a planar version of it.
+Instead, we choose to list each routine,
+together with information about
+the routines that are its direct parents and children.
+This listing presents a window into the call graph.
+Based on our experience,
+both parent information and child information
+is important,
+and should be available without searching
+through the output.
+Figure 1 shows a sample \fIgprof\fP entry.
+.KF
+.DS L
+.TS
+box center;
+c c c c c l l
+c c c c c l l
+c c c c c l l
+l n n n c l l.
+ called/total \ \ parents
+index %time self descendants called+self name index
+ called/total \ \ children
+_
+ 0.20 1.20 4/10 \ \ \s-1CALLER1\s+1 [7]
+ 0.30 1.80 6/10 \ \ \s-1CALLER2\s+1 [1]
+[2] 41.5 0.50 3.00 10+4 \s-1EXAMPLE\s+1 [2]
+ 1.50 1.00 20/40 \ \ \s-1SUB1\s+1 <cycle1> [4]
+ 0.00 0.50 1/5 \ \ \s-1SUB2\s+1 [9]
+ 0.00 0.00 0/5 \ \ \s-1SUB3\s+1 [11]
+.TE
+.ce
+Figure 1. Profile entry for \s-1EXAMPLE\s+1.
+.DE
+.KE
+.PP
+The major entries of the call graph profile are the entries from the
+flat profile, augmented by the time propagated to each
+routine from its descendants.
+This profile is sorted by the sum of the time for the routine
+itself plus the time inherited from its descendants.
+The profile shows which of the higher level routines
+spend large portions of the total execution time
+in the routines that they call.
+For each routine, we show the amount of time passed by each child
+to the routine, which includes time for the child itself
+and for the descendants of the child
+(and thus the descendants of the routine).
+We also show the percentage these times represent of the total time
+accounted to the child.
+Similarly, the parents of each routine are listed,
+along with time,
+and percentage of total routine time,
+propagated to each one.
+.PP
+Cycles are handled as single entities.
+The cycle as a whole is shown as though it were a single routine,
+except that members of the cycle are listed in place of the children.
+Although the number of calls of each member
+from within the cycle are shown,
+they do not affect time propagation.
+When a child is a member of a cycle,
+the time shown is the appropriate fraction of the time
+for the whole cycle.
+Self-recursive routines have their calls broken
+down into calls from the outside and self-recursive calls.
+Only the outside calls affect the propagation of time.
+.PP
+The example shown in Figure 2 is the fragment of a call graph
+corresponding to the entry in the call graph profile listing
+shown in Figure 1.
+.KF
+.DS L
+.so fig2.pic
+.ce
+Figure 2. Example call graph fragment.
+.DE
+.KE
+.PP
+The entry is for routine \s-1EXAMPLE\s+1, which has
+the Caller routines as its parents,
+and the Sub routines as its children.
+The reader should keep in mind that all information
+is given \fIwith respect to \s-1EXAMPLE\s+1\fP.
+The index in the first column shows that \s-1EXAMPLE\s+1
+is the second entry in the profile listing.
+The \s-1EXAMPLE\s+1 routine is called ten times, four times by \s-1CALLER1\s+1,
+and six times by \s-1CALLER2\s+1.
+Consequently 40% of \s-1EXAMPLE\s+1's time is propagated to \s-1CALLER1\s+1,
+and 60% of \s-1EXAMPLE\s+1's time is propagated to \s-1CALLER2\s+1.
+The self and descendant fields of the parents
+show the amount of self and descendant time \s-1EXAMPLE\s+1
+propagates to them (but not the time used by
+the parents directly).
+Note that \s-1EXAMPLE\s+1 calls itself recursively four times.
+The routine \s-1EXAMPLE\s+1 calls routine \s-1SUB1\s+1 twenty times, \s-1SUB2\s+1 once,
+and never calls \s-1SUB3\s+1.
+Since \s-1SUB2\s+1 is called a total of five times,
+20% of its self and descendant time is propagated to \s-1EXAMPLE\s+1's
+descendant time field.
+Because \s-1SUB1\s+1 is a member of \fIcycle 1\fR,
+the self and descendant times
+and call count fraction
+are those for the cycle as a whole.
+Since cycle 1 is called a total of forty times
+(not counting calls among members of the cycle),
+it propagates 50% of the cycle's self and descendant
+time to \s-1EXAMPLE\s+1's descendant time field.
+Finally each name is followed by an index that shows
+where on the listing to find the entry for that routine.
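+.PP
+As a check on the propagation rule,
+the \s-1CALLER1\s+1 row of Figure 1 can be reproduced
+from \s-1EXAMPLE\s+1's totals:
+.EQ
+4 over 10 ~times~ 0.50 ~=~ 0.20, ~~~~ 4 over 10 ~times~ 3.00 ~=~ 1.20
+.EN
+and the \s-1CALLER2\s+1 row is similarly six tenths of each total.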
+.NH 2
+Profiling the Kernel
+.PP
+A 4.2BSD kernel that will automatically collect profiling
+information as it operates can be built simply by specifying the
+.B \-p
+option to \fIconfig\fP\|(8) when configuring a kernel.
+The program counter sampling can be driven by the system clock,
+or by an alternate real time clock.
+The latter is highly recommended as use of the system clock results
+in statistical anomalies in accounting for
+the time spent in the kernel clock routine.
+.PP
+Once a profiling system has been booted, statistics gathering is
+handled by \fIkgmon\fP\|(8).
+\fIKgmon\fP allows profiling to be started and stopped
+and the internal state of the profiling buffers to be dumped.
+\fIKgmon\fP can also be used to reset the state of the internal
+buffers to allow multiple experiments to be run without
+rebooting the machine.
+The profiling data can then be processed with \fIgprof\fP\|(1)
+to obtain information regarding the system's operation.
+.PP
+A profiled system is about 5-10% larger in its text space because of
+the calls to count the subroutine invocations.
+When the system executes,
+the profiling data is stored in a buffer that is 1.2
+times the size of the text space.
+All the information is summarized in memory;
+it is not necessary to have a trace file
+continuously dumped to disk.
+The overhead for running a profiled system varies;
+under normal load we see anywhere from 5-25%
+of the system time spent in the profiling code.
+Thus the system is noticeably slower than an unprofiled system,
+yet is not so bad that it cannot be used in a production environment.
+This is important since it allows us to gather data
+in a real environment rather than trying to
+devise synthetic work loads.
diff --git a/share/doc/papers/kerntune/3.t b/share/doc/papers/kerntune/3.t
new file mode 100644
index 000000000000..e03236b4bac6
--- /dev/null
+++ b/share/doc/papers/kerntune/3.t
@@ -0,0 +1,290 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)3.t 1.2 (Berkeley) 11/8/90
+.\"
+.ds RH Techniques for Improving Performance
+.NH 1
+Techniques for Improving Performance
+.PP
+This section gives several hints on general optimization techniques.
+It then proceeds with an example of how they can be
+applied to the 4.2BSD kernel to improve its performance.
+.NH 2
+Using the Profiler
+.PP
+The profiler is a useful tool for improving
+a set of routines that implement an abstraction.
+It can be helpful in identifying poorly coded routines,
+and in evaluating the new algorithms and code that replace them.
+Taking full advantage of the profiler
+requires a careful examination of the call graph profile,
+and a thorough knowledge of the abstractions underlying
+the kernel.
+.PP
+The easiest optimization that can be performed
+is a small change
+to a control construct or data structure.
+An obvious starting point
+is to expand a small frequently called routine inline.
+The drawback to inline expansion is that the data abstractions
+in the kernel may become less parameterized,
+hence less clearly defined.
+The profiling will also become less useful since the loss of
+routines will make its output more granular.
+.PP
+Further potential for optimization lies in routines that
+implement data abstractions whose total execution
+time is long.
+If the data abstraction function cannot easily be speeded up,
+it may be advantageous to cache its results,
+and eliminate the need to rerun
+it for identical inputs.
+These and other ideas for program improvement are discussed in
+[Bentley81].
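+.PP
+In its simplest form such a cache is just a check ahead of the computation,
+as in this generic sketch (all names here are illustrative):
+.DS
+if (key == last_key)
+	return (last_result);		/* reuse the previous answer */
+last_result = expensive_function(key);	/* compute and remember */
+last_key = key;
+return (last_result);
+.DE
+The name translation caches described below are elaborations of this idea.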
+.PP
+This tool is best used in an iterative approach:
+profiling the kernel,
+eliminating one bottleneck,
+then finding some other part of the kernel
+that begins to dominate execution time.
+.PP
+A completely different use of the profiler is to analyze the control
+flow of an unfamiliar section of the kernel.
+By running an example that exercises the unfamiliar section of the kernel,
+and then using \fIgprof\fR, you can get a view of the
+control structure of the unfamiliar section.
+.NH 2
+An Example of Tuning
+.PP
+The first step is to come up with a method for generating
+profile data.
+We prefer to run a profiling system for a period of about
+one day on one of our general timesharing machines.
+While this is not as reproducible as a synthetic workload,
+it certainly represents a realistic test.
+We have run one day profiles on several
+occasions over a three month period.
+Despite the long period of time that elapsed
+between the test runs, the shape of the profiles,
+as measured by the number of times each system call
+entry point was called, was remarkably similar.
+.PP
+A second alternative is to write a small benchmark
+program to repeatedly exercise a suspected bottleneck.
+While these benchmarks are not useful as a long term profile,
+they can give quick feedback on whether a hypothesized
+improvement is really having an effect.
+It is important to realize that the only real assurance
+that a change has a beneficial effect is through
+long term measurements of general timesharing.
+We have numerous examples where a benchmark program
+suggests vast improvements while the change
+in the long term system performance is negligible,
+and conversely examples in which the benchmark program runs more slowly,
+but the long term system performance improves significantly.
+.PP
+An investigation of our long term profiling showed that
+the single most expensive function performed by the kernel
+is path name translation.
+We find that our general time sharing systems do about
+500,000 name translations per day.
+The cost of doing name translation in the original 4.2BSD
+is 24.2 milliseconds,
+representing 40% of the time processing system calls,
+which is 19% of the total cycles in the kernel,
+or 11% of all cycles executed on the machine.
+The times are shown in Figure 3.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 14.3 ms/call 11.3%
+child 9.9 ms/call 7.9%
+_
+total 24.2 ms/call 19.2%
+.TE
+.ce
+Figure 3. Call times for \fInamei\fP.
+.DE
+.KE
+.PP
+The system measurements collected showed the
+pathname translation routine, \fInamei\fP,
+was clearly worth optimizing.
+An inspection of \fInamei\fP shows that
+it consists of two nested loops.
+The outer loop is traversed once per pathname component.
+The inner loop performs a linear search through a directory looking
+for a particular pathname component.
+.PP
+Our first idea was to observe that many programs
+step through a directory performing an operation on
+each entry in turn.
+This caused us to modify \fInamei\fP to cache
+the directory offset of the last pathname
+component looked up by a process.
+The cached offset is then used
+as the point at which a search in the same directory
+begins. Changing directories invalidates the cache, as
+does modifying the directory.
+For programs that step sequentially through a directory with
+$N$ files, search time decreases from $O ( N sup 2 )$
+to $O(N)$.
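+.PP
+In outline, the cache is a few fields kept in the \fIuser\fP vector
+plus a conditional at the start of the directory search;
+the field names in this sketch are illustrative:
+.DS
+struct {
+	off_t	nc_prevoffset;	/* offset of last component found */
+	ino_t	nc_inumber;	/* inumber of cached directory */
+	dev_t	nc_dev;		/* device of cached directory */
+} u_ncache;
+
+/* Search from the cached offset only when the same
+ * directory is being searched again. */
+if (u_ncache.nc_inumber == dp->i_number && u_ncache.nc_dev == dp->i_dev)
+	offset = u_ncache.nc_prevoffset;
+else
+	offset = 0;
+.DE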
+.PP
+The cost of the cache is about 20 lines of code
+(about 0.2 kilobytes)
+and 16 bytes per process, with the cached data
+stored in a process's \fIuser\fP vector.
+.PP
+As a quick benchmark to verify the effectiveness of the
+cache we ran ``ls \-l''
+on a directory containing 600 files.
+Before the per-process cache this command
+used 22.3 seconds of system time.
+After adding the cache the program used the same amount
+of user time, but the system time dropped to 3.3 seconds.
+.PP
+This change prompted our rerunning a profiled system
+on a machine containing the new \fInamei\fP.
+The results showed that the time in \fInamei\fP
+dropped by only 2.6 ms/call and
+still accounted for 36% of the system call time,
+18% of the kernel, or about 10% of all the machine cycles.
+This amounted to a drop in system time from 57% to about 55%.
+The results are shown in Figure 4.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 11.0 ms/call 9.2%
+child 10.6 ms/call 8.9%
+_
+total 21.6 ms/call 18.1%
+.TE
+.ce
+Figure 4. Call times for \fInamei\fP with per-process cache.
+.DE
+.KE
+.PP
+The small performance improvement
+was caused by a low cache hit ratio.
+Although the cache was 90% effective when hit,
+it was only usable on about 25% of the names being translated.
+An additional reason for the small improvement was that
+although the amount of time spent in \fInamei\fP itself
+decreased substantially,
+more time was spent in the routines that it called
+since each directory had to be accessed twice;
+once to search from the middle to the end,
+and once to search from the beginning to the middle.
+.PP
+Most missed names were caused by path name components
+other than the last.
+Thus Robert Elz introduced a system wide cache of most recent
+name translations.
+The cache is keyed on a name and the
+inode and device number of the directory that contains it.
+Associated with each entry is a pointer to the corresponding
+entry in the inode table.
+This has the effect of short circuiting the outer loop of \fInamei\fP.
+For each path name component,
+\fInamei\fP first looks in its cache of recent translations
+for the needed name.
+If it exists, the directory search can be completely eliminated.
+If the name is not recognized,
+then the per-process cache may still be useful in
+reducing the directory search time.
+The two cacheing schemes complement each other well.
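+.PP
+A sketch of a cache entry follows;
+the field names and sizes are illustrative,
+not the actual declarations:
+.DS
+struct namecache {
+	struct	namecache *nc_forw;	/* hash-chain link */
+	ino_t	nc_dino;		/* inumber of parent directory */
+	dev_t	nc_dev;			/* device of parent directory */
+	char	nc_name[NCHNAMLEN];	/* pathname component */
+	struct	inode *nc_ip;		/* pointer into the inode table */
+};
+.DE
+A lookup would hash the component name together with the parent's
+inumber and device, then walk the chain comparing all three.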
+.PP
+The cost of the name cache is about 200 lines of code
+(about 1.2 kilobytes)
+and 44 bytes per cache entry.
+Depending on the size of the system,
+about 200 to 1000 entries will normally be configured,
+using 10-44 kilobytes of physical memory.
+The name cache is resident in memory at all times.
+.PP
+After adding the system wide name cache we reran ``ls \-l''
+on the same directory.
+The user time remained the same;
+however, the system time rose slightly to 3.7 seconds.
+This was not surprising as \fInamei\fP
+now had to maintain the cache,
+but was never able to make any use of it.
+.PP
+Another profiled system was created and measurements
+were collected over a one day period. These measurements
+showed a 6 ms/call decrease in \fInamei\fP, with
+\fInamei\fP accounting for only 31% of the system call time,
+16% of the time in the kernel,
+or about 7% of all the machine cycles.
+System time dropped from 55% to about 49%.
+The results are shown in Figure 5.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 9.5 ms/call 9.6%
+child 6.1 ms/call 6.1%
+_
+total 15.6 ms/call 15.7%
+.TE
+.ce
+Figure 5. Call times for \fInamei\fP with both caches.
+.DE
+.KE
+.PP
+Statistics on the performance of both caches show
+the large performance improvement is
+caused by the high hit ratio.
+On the profiled system a 60% hit rate was observed in
+the system wide cache. This, coupled with the 25%
+hit rate in the per-process offset cache, yielded an
+effective cache hit rate of 85%.
+While the system wide cache reduces the amount of time both in
+the routines that \fInamei\fP calls and in \fInamei\fP itself
+(since fewer directories need to be accessed or searched),
+it is interesting to note that the actual percentage of system
+time spent in \fInamei\fP itself increases even though the
+actual time per call decreases.
+This is because less total time is being spent in the kernel,
+hence a smaller absolute time becomes a larger total percentage.
diff --git a/share/doc/papers/kerntune/4.t b/share/doc/papers/kerntune/4.t
new file mode 100644
index 000000000000..fcd0ad095c26
--- /dev/null
+++ b/share/doc/papers/kerntune/4.t
@@ -0,0 +1,99 @@
+.\" Copyright (c) 1984 M. K. McKusick
+.\" Copyright (c) 1984 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)4.t 1.2 (Berkeley) 11/8/90
+.\"
+.ds RH Conclusions
+.NH 1
+Conclusions
+.PP
+We have created a profiler that aids in the evaluation
+of the kernel.
+For each routine in the kernel,
+the profile shows the extent to which that routine
+helps support various abstractions,
+and how that routine uses other abstractions.
+The profile assesses the cost of routines
+at all levels of the kernel decomposition.
+The profiler is easily used,
+and can be compiled into the kernel.
+It adds only five to thirty percent execution overhead to the kernel
+being profiled,
+produces no additional output while the kernel is running,
+and allows the kernel to be measured in its real environment.
+Kernel profiles can be used to identify bottlenecks in performance.
+We have shown how to improve performance
+by caching recently calculated name translations.
+The combined caches added to the name translation process
+reduce the average cost of translating a pathname to an inode by 35%.
+These changes reduce the percentage of time spent running
+in the system by nearly 9%.
+.nr H2 1
+.ds RH Acknowledgements
+.SH
+\s+2Acknowledgements\s0
+.PP
+I would like to thank Robert Elz for sharing his ideas and
+his code for cacheing system wide names.
+Thanks also to all the users at Berkeley who provided all the
+input to generate the kernel profiles.
+This work was supported by
+the Defense Advanced Research Projects Agency (DoD) under
+Arpa Order No. 4031 monitored by Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.ds RH References
+.nr H2 1
+.sp 2
+.SH
+\s+2References\s-2
+.LP
+.IP [Bentley81] 20
+Bentley, J. L.,
+``Writing Efficient Code'',
+Department of Computer Science,
+Carnegie-Mellon University,
+Pittsburgh, Pennsylvania,
+CMU-CS-81-116, 1981.
+.IP [Graham82] 20
+Graham, S., Kessler, P., McKusick, M.,
+``gprof: A Call Graph Execution Profiler'',
+Proceedings of the SIGPLAN '82 Symposium on Compiler Construction,
+Volume 17, Number 6, June 1982. pp 120-126
+.IP [Graham83] 20
+Graham, S., Kessler, P., McKusick, M.,
+``An Execution Profiler for Modular Programs'',
+Software - Practice and Experience,
+Volume 13, 1983. pp 671-685
+.IP [Ritchie74] 20
+Ritchie, D. M. and Thompson, K.,
+``The UNIX Time-Sharing System'',
+CACM 17, 7. July 1974. pp 365-375
diff --git a/share/doc/papers/kerntune/Makefile b/share/doc/papers/kerntune/Makefile
new file mode 100644
index 000000000000..f1d21cdd597c
--- /dev/null
+++ b/share/doc/papers/kerntune/Makefile
@@ -0,0 +1,10 @@
+# @(#)Makefile 1.5 (Berkeley) 6/8/93
+
+DIR= papers/kerntune
+SRCS= 0.t 1.t 2.t 3.t 4.t
+MACROS= -ms
+
+paper.ps: ${SRCS}
+ ${SOELIM} ${SRCS} | ${PIC} | ${TBL} | ${EQN} | ${ROFF} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/kerntune/fig2.pic b/share/doc/papers/kerntune/fig2.pic
new file mode 100644
index 000000000000..6731ca99f972
--- /dev/null
+++ b/share/doc/papers/kerntune/fig2.pic
@@ -0,0 +1,57 @@
+.\" Copyright (c) 1987 M. K. McKusick
+.\" Copyright (c) 1987 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig2.pic 1.2 (Berkeley) 11/8/90
+.\"
+.PS
+ellipse ht .3i wid .75i "\s-1CALLER1\s+1"
+ellipse ht .3i wid .75i "\s-1CALLER2\s+1" at 1st ellipse + (2i,0i)
+ellipse ht .3i wid .8i "\s-1EXAMPLE\s+1" at 1st ellipse + (1i,-.5i)
+ellipse ht .3i wid .5i "\s-1SUB1\s+1" at 1st ellipse - (0i,1i)
+ellipse ht .3i wid .5i "\s-1SUB2\s+1" at 3rd ellipse - (0i,.5i)
+ellipse ht .3i wid .5i "\s-1SUB3\s+1" at 2nd ellipse - (0i,1i)
+line <- from 1st ellipse up .5i left .5i chop .1875i
+line <- from 1st ellipse up .5i right .5i chop .1875i
+line <- from 2nd ellipse up .5i left .5i chop .1875i
+line <- from 2nd ellipse up .5i right .5i chop .1875i
+arrow from 1st ellipse to 3rd ellipse chop
+arrow from 2nd ellipse to 3rd ellipse chop
+arrow from 3rd ellipse to 4th ellipse chop
+arrow from 3rd ellipse to 5th ellipse chop .15i chop .15i
+arrow from 3rd ellipse to 6th ellipse chop
+arrow from 4th ellipse down .5i left .5i chop .1875i
+arrow from 4th ellipse down .5i right .5i chop .1875i
+arrow from 5th ellipse down .5i left .5i chop .1875i
+arrow from 5th ellipse down .5i right .5i chop .1875i
+arrow from 6th ellipse down .5i left .5i chop .1875i
+arrow from 6th ellipse down .5i right .5i chop .1875i
+.PE
diff --git a/share/doc/papers/memfs/0.t b/share/doc/papers/memfs/0.t
new file mode 100644
index 000000000000..d476f179752d
--- /dev/null
+++ b/share/doc/papers/memfs/0.t
@@ -0,0 +1,86 @@
+.\" Copyright (c) 1990 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)0.t 5.1 (Berkeley) 4/16/91
+.\"
+.rm CM
+.nr PO 1.25i
+.ds CH "
+.ds CF "%
+.nr Fn 0 1
+.ds b3 4.3\s-1BSD\s+1
+.de KI
+.ds Lb "Fig. \\n+(Fn
+.KF
+.ce 1
+Figure \\n(Fn - \\$1.
+..
+.de SM
+\\s-1\\$1\\s+1\\$2
+..
+.de NM
+\&\fI\\$1\fP\\$2
+..
+.de RN
+\&\fI\\$1\fP\^(\^)\\$2
+..
+.de PN
+\&\fB\\$1\fP\\$2
+..
+.TL
+A Pageable Memory Based Filesystem
+.AU
+Marshall Kirk McKusick
+.AU
+Michael J. Karels
+.AU
+Keith Bostic
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.sp
+email: mckusick@cs.Berkeley.EDU
+telephone: 415-642-4948
+.AB
+This paper describes the motivations for memory-based filesystems.
+It compares techniques used to implement them and
+describes the drawbacks of using dedicated memory to
+support such filesystems.
+To avoid the drawbacks of using dedicated memory,
+it discusses building a simple memory-based
+filesystem in pageable memory.
+It details the performance characteristics of this filesystem
+and concludes with areas for future work.
+.AE
+.LP
diff --git a/share/doc/papers/memfs/1.t b/share/doc/papers/memfs/1.t
new file mode 100644
index 000000000000..a065844390c7
--- /dev/null
+++ b/share/doc/papers/memfs/1.t
@@ -0,0 +1,392 @@
+.\" Copyright (c) 1990 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)1.t 5.1 (Berkeley) 4/16/91
+.\"
+.nr PS 11
+.nr VS 13
+.SH
+Introduction
+.PP
+This paper describes the motivation for and implementation of
+a memory-based filesystem.
+Memory-based filesystems have existed for a long time;
+they have generally been marketed as RAM disks or sometimes
+as software packages that use the machine's general purpose memory.
+.[
+white
+.]
+.PP
+A RAM disk is designed to appear like any other disk peripheral
+connected to a machine.
+It is normally interfaced to the processor through the I/O bus
+and is accessed through a device driver similar or sometimes identical
+to the device driver used for a normal magnetic disk.
+The device driver sends requests for blocks of data to the device
+and the data is then DMA'ed to or from the requested block.
+Instead of storing its data on a rotating magnetic disk,
+the RAM disk stores its data in a large array of random access memory
+or bubble memory.
+Thus, the latency of accessing the RAM disk is nearly zero
+compared to the 15-50 milliseconds of latency incurred when
+accessing rotating magnetic media.
+RAM disks also have the benefit of being able to transfer data at
+the maximum DMA rate of the system,
+while disks are typically limited by the rate that the data passes
+under the disk head.
+.PP
+Software packages simulating RAM disks operate by allocating
+a fixed partition of the system memory.
+The software then provides a device driver interface similar
+to the one described for hardware RAM disks,
+except that it uses memory-to-memory copy instead of DMA to move
+the data between the RAM disk and the system buffers,
+or it maps the contents of the RAM disk into the system buffers.
+Because the memory used by the RAM disk is not available for
+other purposes, software RAM-disk solutions are used primarily
+for machines with limited addressing capabilities such as PC's
+that do not have an effective way of using the extra memory anyway.
+.PP
+Most software RAM disks lose their contents when the system is powered
+down or rebooted.
+The contents can be saved by using battery backed-up memory,
+by storing critical filesystem data structures in the filesystem,
+and by running a consistency check program after each reboot.
+These provisions increase the hardware cost
+and potentially slow down the disk.
+Thus, RAM-disk filesystems are not typically
+designed to survive power failures;
+because of their volatility, their usefulness is limited to transient
+or easily recreated information such as might be found in
+.PN /tmp .
+Their primary benefit is that they have higher throughput
+than disk based filesystems.
+.[
+smith
+.]
+This improved throughput is particularly useful for utilities that
+make heavy use of temporary files, such as compilers.
+On fast processors, nearly half of the elapsed time for a compilation
+is spent waiting for synchronous operations required for file
+creation and deletion.
+The use of the memory-based filesystem nearly eliminates this waiting time.
+.PP
+Using dedicated memory to exclusively support a RAM disk
+is a poor use of resources.
+The overall throughput of the system can be improved
+by using the memory where it is getting the highest access rate.
+These needs may shift between supporting process virtual address spaces
+and caching frequently used disk blocks.
+If the memory is dedicated to the filesystem,
+it is better used in a buffer cache.
+The buffer cache permits faster access to the data
+because it requires only a single memory-to-memory copy
+from the kernel to the user process.
+Memory used in a RAM-disk configuration may require two
+memory-to-memory copies: one from the RAM disk
+to the buffer cache,
+then another copy from the buffer cache to the user process.
+.PP
+The new work presented in this paper is a prototype
+RAM-disk filesystem built in pageable memory instead of dedicated memory.
+The goal is to provide the speed benefits of a RAM disk
+without paying the performance penalty inherent in dedicating
+part of the physical memory on the machine to the RAM disk.
+Because the filesystem is built in pageable memory,
+it competes with other processes for the available memory.
+When memory runs short, the paging system pushes its
+least-recently-used pages to backing store.
+Being pageable also allows the filesystem to be much larger than
+would be practical if it were limited by the amount of physical
+memory that could be dedicated to that purpose.
+We typically operate our
+.PN /tmp
+with 30 to 60 megabytes of space,
+which is larger than the amount of memory on the machine.
+This configuration allows small files to be accessed quickly,
+while still allowing
+.PN /tmp
+to be used for big files,
+although at a speed more typical of normal, disk-based filesystems.
+.PP
+An alternative to building a memory-based filesystem would be to have
+a filesystem that never did operations synchronously and never
+flushed its dirty buffers to disk.
+However, we believe that such a filesystem would either
+use a disproportionately large percentage of the buffer
+cache space, to the detriment of other filesystems,
+or would require the paging system to flush its dirty pages.
+Waiting for other filesystems to push dirty pages
+subjects them to delays while waiting for the pages to be written.
+We await the results of others trying this approach.
+.[
+Ohta
+.]
+.SH
+Implementation
+.PP
+The current implementation took less time to write than did this paper.
+It consists of 560 lines of kernel code (1.7K text + data)
+and some minor modifications to the program that builds
+disk based filesystems, \fInewfs\fP.
+A condensed version of the kernel code for the
+memory-based filesystem is reproduced in Appendix 1.
+.PP
+A filesystem is created by invoking the modified \fInewfs\fP with
+an option telling it to create a memory-based filesystem.
+It allocates a section of virtual address space of the requested
+size and builds a filesystem in the memory
+instead of on a disk partition.
+When the filesystem has been built, \fInewfs\fP does a \fImount\fP system call specifying a filesystem type of
+.SM MFS
+(Memory File System).
+The auxiliary data parameter to the mount call specifies a pointer
+to the base of the memory in which it has built the filesystem.
+(The auxiliary data parameter used by the local filesystem, \fIufs\fP,
+specifies the block device containing the filesystem.)
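+.PP
+As a concrete illustration,
+the mount request issued by \fInewfs\fP might look as sketched below.
+The sketch uses the \fImfs_args\fP structure shown in Appendix 1;
+the mount-type constant, the exported name, and the error handling
+are illustrative assumptions rather than details of the implementation.
+.DS
+struct mfs_args args;
+
+args.name = "mfs:1234";		/* name to export for statfs */
+args.base = base;		/* base of the filesystem built in memory */
+args.size = size;		/* size of the filesystem in bytes */
+/* MOUNT_MFS is assumed to name the MFS filesystem type */
+if (mount(MOUNT_MFS, "/tmp", 0, (caddr_t)&args) < 0)
+	perror("mount");
+.DE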
+.PP
+The mount system call allocates and initializes a mount table
+entry and then calls the filesystem-specific mount routine.
+The filesystem-specific routine is responsible for doing
+the mount and initializing the filesystem-specific
+portion of the mount table entry.
+The memory-based filesystem-specific mount routine,
+.RN mfs_mount ,
+is shown in Appendix 1.
+It allocates a block-device vnode to represent the memory disk device.
+In the private area of this vnode it stores the base address of
+the filesystem and the process identifier of the \fInewfs\fP process
+for later reference when doing I/O.
+It also initializes an I/O list that it
+uses to record outstanding I/O requests.
+It can then call the \fIufs\fP filesystem mount routine,
+passing the special block-device vnode that it has created
+instead of the usual disk block-device vnode.
+The mount proceeds just as any other local mount, except that
+requests to read from the block device are vectored through
+.RN mfs_strategy
+(described below) instead of the usual
+.RN spec_strategy
+block device I/O function.
+When the mount is completed,
+.RN mfs_mount
+does not return as most other filesystem mount functions do;
+instead it sleeps in the kernel awaiting I/O requests.
+Each time an I/O request is posted for the filesystem,
+a wakeup is issued for the corresponding \fInewfs\fP process.
+When awakened, the process checks for requests on its buffer list.
+A read request is serviced by copying data from the section of the
+\fInewfs\fP address space corresponding to the requested disk block
+to the kernel buffer.
+Similarly, a write request is serviced by copying data from the kernel
+buffer to the section of the \fInewfs\fP address space
+corresponding to the requested disk block.
+When all the requests have been serviced, the \fInewfs\fP
+process returns to sleep to await more requests.
+.PP
+Once mounted,
+all operations on files in the memory-based filesystem are handled
+by the \fIufs\fP filesystem code until they get to the point where the
+filesystem needs to do I/O on the device.
+Here, the filesystem encounters the second piece of the
+memory-based filesystem.
+Instead of calling the special-device strategy routine,
+it calls the memory-based strategy routine,
+.RN mfs_strategy .
+Usually,
+the request is serviced by linking the buffer onto the
+I/O list for the memory-based filesystem
+vnode and sending a wakeup to the \fInewfs\fP process.
+This wakeup results in a context-switch to the \fInewfs\fP
+process, which does a copyin or copyout as described above.
+The strategy routine must be careful to check whether
+the I/O request is coming from the \fInewfs\fP process itself, however.
+Such requests happen during mount and unmount operations,
+when the kernel is reading and writing the superblock.
+Here,
+.RN mfs_strategy
+must do the I/O itself to avoid deadlock.
+.PP
+The final piece of kernel code to support the
+memory-based filesystem is the close routine.
+After the filesystem has been successfully unmounted,
+the device close routine is called.
+For a memory-based filesystem, the device close routine is
+.RN mfs_close .
+This routine flushes any pending I/O requests,
+then sets the I/O list head to a special value
+that is recognized by the I/O servicing loop in
+.RN mfs_mount
+as an indication that the filesystem is unmounted.
+The
+.RN mfs_mount
+routine exits, in turn causing the \fInewfs\fP process
+to exit, resulting in the filesystem vanishing in a cloud of dirty pages.
+.PP
+The paging of the filesystem does not require any additional
+code beyond that already in the kernel to support virtual memory.
+The \fInewfs\fP process competes with other processes on an equal basis
+for the machine's available memory.
+Data pages of the filesystem that have not yet been used
+are zero-fill-on-demand pages that do not occupy memory,
+although they currently allocate space in backing store.
+As long as memory is plentiful, the entire contents of the filesystem
+remain memory resident.
+When memory runs short, the oldest pages of \fInewfs\fP will be
+pushed to backing store as part of the normal paging activity.
+The pages that are pushed usually hold the contents of
+files that have been created in the memory-based filesystem
+but have not been recently accessed (or have been deleted).
+.[
+leffler
+.]
+.SH
+Performance
+.PP
+The performance of the current memory-based filesystem is determined by
+the memory-to-memory copy speed of the processor.
+Empirically we find that the throughput is about 45% of this
+memory-to-memory copy speed.
+The basic set of steps for each block written is:
+.IP 1)
+memory-to-memory copy from the user process doing the write to a kernel buffer
+.IP 2)
+context-switch to the \fInewfs\fP process
+.IP 3)
+memory-to-memory copy from the kernel buffer to the \fInewfs\fP address space
+.IP 4)
+context switch back to the writing process
+.LP
+Thus each write requires at least two memory-to-memory copies,
+accounting for about 90% of the
+.SM CPU
+time.
+The remaining 10% is consumed in the context switches and
+the filesystem allocation and block location code.
+(These figures are consistent:
+if two copies account for 90% of the time spent per block,
+the delivered throughput is 0.9/2, or 45%, of the raw copy speed.)
+The actual context switch count is only about half
+of the worst case outlined above, because
+read-ahead and write-behind allow multiple blocks
+to be handled with each context switch.
+.PP
+On the six-\c
+.SM "MIPS CCI"
+Power 6/32 machine,
+the raw reading and writing speed is only about twice that of
+a regular disk-based filesystem.
+However, for processes that create and delete many files,
+the speedup is considerably greater.
+The reason for the speedup is that the filesystem
+must do two synchronous operations to create a file,
+first writing the allocated inode to disk, then creating the
+directory entry.
+Deleting a file similarly requires at least two synchronous
+operations.
+Here, the low latency of the memory-based filesystem is
+noticeable compared to the disk-based filesystem,
+as a synchronous operation can be done with
+just two context switches instead of incurring the disk latency.
+.SH
+Future Work
+.PP
+The most obvious shortcoming of the current implementation
+is that filesystem blocks are copied twice, once between the \fInewfs\fP
+process' address space and the kernel buffer cache,
+and once between the kernel buffer and the requesting process.
+These copies are done in different process contexts, necessitating
+two context switches per group of I/O requests.
+These problems arise because of the current inability of the kernel
+to do page-in operations
+for an address space other than that of the currently-running process,
+and the current inconvenience of mapping process-owned pages into the kernel
+buffer cache.
+Both of these problems are expected to be solved in the next version
+of the virtual memory system,
+and thus we chose not to address them in the current implementation.
+With the new version of the virtual memory system, we expect to use
+any part of physical memory as part of the buffer cache,
+even though it will not be entirely addressable at once within the kernel.
+In that system, the implementation of a memory-based filesystem
+that avoids the double copy and context switches will be much easier.
+.PP
+Ideally part of the kernel's address space would reside in pageable memory.
+Once such a facility is available it would be most efficient to
+build a memory-based filesystem within the kernel.
+One potential problem with such a scheme is that many kernels
+are limited to a small address space (usually a few megabytes).
+This restriction limits the size of the memory-based
+filesystem that such a machine can support.
+On such a machine, the kernel can describe a memory-based filesystem
+that is larger than its address space and use a ``window''
+to map the larger filesystem address space into its limited address space.
+The window would maintain a cache of recently accessed pages.
+The problem with this scheme is that if the working set of
+active pages is greater than the size of the window, then
+much time is spent remapping pages and invalidating
+translation buffers.
+Alternatively, a separate address space could be constructed for each
+memory-based filesystem as in the current implementation,
+and the memory-resident pages of that address space could be accessed
+exactly as other cached pages are.
+.PP
+The current system uses the existing local filesystem structures
+and code to implement the memory-based filesystem.
+The major advantages of this approach are the sharing of code
+and the simplicity of the approach.
+There are several disadvantages, however.
+One is that the size of the filesystem is fixed at mount time.
+This means that a fixed number of inodes (files) and data blocks
+can be supported.
+Currently, this approach requires enough swap space for the entire
+filesystem, and prevents expansion and contraction of the filesystem on demand.
+The current design also prevents the filesystem from taking advantage
+of its memory-resident character.
+It would be interesting to explore other filesystem implementations
+that would be less expensive to execute and that would make better
+use of the space.
+For example, the current filesystem structure is optimized for magnetic
+disks.
+It includes replicated control structures, ``cylinder groups''
+with separate allocation maps and control structures,
+and data structures that optimize rotational layout of files.
+None of this is useful in a memory-based filesystem (at least when the
+backing store for the filesystem is dynamically allocated and not
+contiguous on a single disk type).
+On the other hand,
+directories could be implemented using dynamically-allocated
+memory organized as linked lists or trees rather than as files stored
+in ``disk'' blocks.
+Allocation and location of pages for file data might use virtual memory
+primitives and data structures rather than direct and indirect blocks.
+A reimplementation along these lines will be considered when the virtual
+memory system in the current system has been replaced.
+.[
+$LIST$
+.]
diff --git a/share/doc/papers/memfs/A.t b/share/doc/papers/memfs/A.t
new file mode 100644
index 000000000000..c1938c8999e1
--- /dev/null
+++ b/share/doc/papers/memfs/A.t
@@ -0,0 +1,173 @@
+.\" Copyright (c) 1990 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)A.t 5.1 (Berkeley) 4/16/91
+.\"
+.bp
+.nr PS 10
+.nr VS 12
+.SH
+Appendix A - Implementation Details
+.LP
+.nf
+.vS
+/*
+ * This structure defines the control data for the memory
+ * based file system.
+ */
+struct mfsnode {
+ struct vnode *mfs_vnode; /* vnode associated with this mfsnode */
+ caddr_t mfs_baseoff; /* base of file system in memory */
+ long mfs_size; /* size of memory file system */
+ pid_t mfs_pid; /* supporting process pid */
+ struct buf *mfs_buflist; /* list of I/O requests */
+};
+
+/*
+ * Convert between mfsnode pointers and vnode pointers
+ */
+#define VTOMFS(vp) ((struct mfsnode *)(vp)->v_data)
+#define MFSTOV(mfsp) ((mfsp)->mfs_vnode)
+#define MFS_EXIT (struct buf *)-1
+
+/*
+ * Arguments to mount MFS
+ */
+struct mfs_args {
+ char *name; /* name to export for statfs */
+ caddr_t base; /* base address of file system in memory */
+ u_long size; /* size of file system */
+};
+.bp
+/*
+ * Mount an MFS filesystem.
+ */
+mfs_mount(mp, path, data)
+ struct mount *mp;
+ char *path;
+ caddr_t data;
+{
+ struct vnode *devvp;
+ struct mfsnode *mfsp;
+ struct buf *bp;
+	struct mfs_args args;
+	caddr_t offset;	/* address of the block in the filesystem image */
+
+ /*
+ * Create a block device to represent the disk.
+ */
+ devvp = getnewvnode(VT_MFS, VBLK, &mfs_vnodeops);
+ mfsp = VTOMFS(devvp);
+ /*
+ * Save base address of the filesystem from the supporting process.
+ */
+	copyin(data, &args, sizeof(args));
+ mfsp->mfs_baseoff = args.base;
+ mfsp->mfs_size = args.size;
+ /*
+ * Record the process identifier of the supporting process.
+ */
+ mfsp->mfs_pid = u.u_procp->p_pid;
+ /*
+ * Mount the filesystem.
+ */
+ mfsp->mfs_buflist = NULL;
+ mountfs(devvp, mp);
+ /*
+ * Loop processing I/O requests.
+ */
+ while (mfsp->mfs_buflist != MFS_EXIT) {
+ while (mfsp->mfs_buflist != NULL) {
+ bp = mfsp->mfs_buflist;
+ mfsp->mfs_buflist = bp->av_forw;
+ offset = mfsp->mfs_baseoff + (bp->b_blkno * DEV_BSIZE);
+ if (bp->b_flags & B_READ)
+ copyin(offset, bp->b_un.b_addr, bp->b_bcount);
+ else /* write_request */
+ copyout(bp->b_un.b_addr, offset, bp->b_bcount);
+ biodone(bp);
+ }
+ sleep((caddr_t)devvp, PWAIT);
+ }
+}
+.bp
+/*
+ * If the MFS process requests the I/O then we must do it directly.
+ * Otherwise put the request on the list and request the MFS process
+ * to be run.
+ */
+mfs_strategy(bp)
+ struct buf *bp;
+{
+ struct vnode *devvp;
+ struct mfsnode *mfsp;
+	caddr_t offset;
+
+ devvp = bp->b_vp;
+ mfsp = VTOMFS(devvp);
+ if (mfsp->mfs_pid == u.u_procp->p_pid) {
+ offset = mfsp->mfs_baseoff + (bp->b_blkno * DEV_BSIZE);
+ if (bp->b_flags & B_READ)
+ copyin(offset, bp->b_un.b_addr, bp->b_bcount);
+ else /* write_request */
+ copyout(bp->b_un.b_addr, offset, bp->b_bcount);
+ biodone(bp);
+ } else {
+ bp->av_forw = mfsp->mfs_buflist;
+ mfsp->mfs_buflist = bp;
+ wakeup((caddr_t)bp->b_vp);
+ }
+}
+
+/*
+ * The close routine is called by unmount after the filesystem
+ * has been successfully unmounted.
+ */
+mfs_close(devvp)
+ struct vnode *devvp;
+{
+	struct mfsnode *mfsp = VTOMFS(devvp);
+ struct buf *bp;
+
+ /*
+ * Finish any pending I/O requests.
+ */
+ while (bp = mfsp->mfs_buflist) {
+ mfsp->mfs_buflist = bp->av_forw;
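+		/* mfs_doio() is assumed to do the copyin/copyout, as in mfs_strategy() */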
+ mfs_doio(bp, mfsp->mfs_baseoff);
+ wakeup((caddr_t)bp);
+ }
+ /*
+ * Send a request to the filesystem server to exit.
+ */
+ mfsp->mfs_buflist = MFS_EXIT;
+	wakeup((caddr_t)devvp);
+}
+.vE
diff --git a/share/doc/papers/memfs/Makefile b/share/doc/papers/memfs/Makefile
new file mode 100644
index 000000000000..3e67998a051a
--- /dev/null
+++ b/share/doc/papers/memfs/Makefile
@@ -0,0 +1,22 @@
+# @(#)Makefile 1.8 (Berkeley) 6/8/93
+
+DIR= papers/memfs
+SRCS= 0.t 1.t
+MACROS= -ms
+REFER= refer -n -e -l -s -p ref.bib
+EXTRA= ref.bib A.t tmac.srefs
+CLEANFILES=ref.bib.i A.gt paper.t
+
+paper.ps: paper.t
+ ${ROFF} tmac.srefs paper.t > ${.TARGET}
+
+paper.t: ${SRCS} ref.bib.i A.gt
+ ${REFER} ${SRCS} A.gt > ${.TARGET}
+
+ref.bib.i: ref.bib
+ ${INDXBIB} ref.bib
+
+A.gt: A.t
+ ${GRIND} < A.t > A.gt
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/memfs/ref.bib b/share/doc/papers/memfs/ref.bib
new file mode 100644
index 000000000000..89ae5070dd7d
--- /dev/null
+++ b/share/doc/papers/memfs/ref.bib
@@ -0,0 +1,49 @@
+%A M. K. McKusick
+%A J. M. Bloom
+%A M. J. Karels
+%T Bug Fixes and Changes in 4.3BSD
+%B \s-1UNIX\s0 System Manager's Manual, 4.3 Berkeley Software Distribution, Virtual VAX-11 Version
+%I \s-1USENIX\s0 Association
+%C Berkeley, CA
+%P 12:1\-22
+%D 1986
+
+%A M. J. Karels
+%T Changes to the Kernel in 4.3BSD
+%B \s-1UNIX\s0 System Manager's Manual, 4.3 Berkeley Software Distribution, Virtual VAX-11 Version
+%I \s-1USENIX\s0 Association
+%C Berkeley, CA
+%P 13:1\-32
+%D 1986
+
+%A S. J. Leffler
+%A M. K. McKusick
+%A M. J. Karels
+%A J. S. Quarterman
+%T The Design and Implementation of the 4.3BSD UNIX Operating System
+%I Addison-Wesley
+%C Reading, MA
+%D 1989
+
+%A R. M. White
+%T Disk Storage Technology
+%J Scientific American
+%V 243
+%N 2
+%P 138\-148
+%D August 1980
+
+%A A. J. Smith
+%T Bibliography on file and I/O system optimizations and related topics
+%J Operating Systems Review
+%V 14
+%N 4
+%P 39\-54
+%D October 1981
+
+%A Masataka Ohta
+%A Hiroshi Tezuka
+%T A Fast /tmp File System by Async Mount Option
+%J \s-1USENIX\s0 Association Conference Proceedings
+%P ???\-???
+%D June 1990
diff --git a/share/doc/papers/memfs/spell.ok b/share/doc/papers/memfs/spell.ok
new file mode 100644
index 000000000000..7aa465fb693f
--- /dev/null
+++ b/share/doc/papers/memfs/spell.ok
@@ -0,0 +1,18 @@
+Berkeley.EDU
+Bostic
+CH
+CM
+Fn
+Karels
+Lb
+MFS
+McKusick
+Pageable
+copyin
+copyout
+email
+filesystem
+filesystems
+mckusick
+pageable
+tmp
diff --git a/share/doc/papers/memfs/tmac.srefs b/share/doc/papers/memfs/tmac.srefs
new file mode 100644
index 000000000000..62451181e6e6
--- /dev/null
+++ b/share/doc/papers/memfs/tmac.srefs
@@ -0,0 +1,177 @@
+.\" @(#)tmac.srefs 1.14 11/2/88
+.\" REFER macros .... citations
+.de []
+.][ \\$1
+..
+.de ][
+.if \\$1>5 .tm Bad arg to []
+.[\\$1
+..
+.if n .ds [. [
+.\".if t .ds [. \s-2\v'-.4m'\f1
+.if t .ds [. [
+.if n .ds .] ]
+.\".if t .ds .] \v'.4m'\s+2\fP
+.if t .ds .] ]
+.ds (. \& [
+.ds .) ]
+.if n .ds [o ""
+.if n .ds [c ""
+.if t .ds [o ``
+.if t .ds [c ''
+.ds [e \\fIet al.\\fP
+.\" for author list in reference:
+.ds &1 &
+.\" for -m signal (auth1 and auth2, year):
+.ds &2 &
+.\" the next lines deal with the problem of .[1] or [1].
+.\" refer will write "linexxx\*(<.[1]\*(>.
+.\" and either "<." or ">." should produce the .;
+.\" similarly for , and ;
+.rm <. <, <;
+.if n .ds >. .
+.if t .ds >. .
+.if n .ds >, ,
+.if t .ds >, ,
+.if n .ds >; ;
+.if t .ds >; ;
+.de [5 \" tm style
+.FS
+.IP "\\*([F.\0"
+\\*([A, \\f2\\*([T\\f1,
+.ie \\n(TN \\*([M.
+.el Bell Laboratories internal memorandum (\\*([D).
+.RT
+.FE
+..
+.de [0 \" other
+.FS
+.nr [: 0
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \{.nr [: 1
+\\*([A\c\}
+.if !"\\*([T"" \{.if \\n([:>0 ,
+.nr [: 1
+\\f2\\*([T\\f1\c\}
+.if !"\\*([O""\{.if \\n([:>0 ,
+.nr [: 1
+.if \\n([O>0 .nr [: 0
+\\*([O\c
+.if \\n([O>0 \& \c\}
+.ie !"\\*([D"" \{.if \\n([:>0 ,
+.nr [: 1
+\\*([D\c\}
+.if \\n([:>0 \&.
+.RT
+.FE
+..
+.de [1 \" journal article
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \\*([A,
+.if !"\\*([T"" \\*([o\\*([T,\\*([c
+\\f2\\*([J\\f1\c
+.if !"\\*([V"" .if n \& Vol.\&\c
+.if !"\\*([V"" \& \\f3\\*([V\\f1\c
+.if !"\\*([N"" (\\*([N)\c
+.if !"\\*([P"" \{\
+.ie \\n([P>0 , pp. \c
+.el , p. \c
+\\*([P\c\}
+.if !"\\*([I"" .if "\\*([R"" , \\*([I\c
+.if !"\\*([O"" .if \\n([O=0 , \\*([O\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([O"" .if \\n([O>0 \\*([O
+.RT
+.FE
+..
+.de [2 \" book
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \\*([A,
+.if !"\\*([T"" \\f2\\*([T,\\f1
+\\*([I\c
+.if !"\\*([C"" , \\*([C\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([G"" Gov't. ordering no. \\*([G.
+.if !"\\*([O"" \\*([O
+.RT
+.FE
+..
+.de [4 \" report
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+\\*([A, \\*([o\\*([T,\\*([c
+\\*([R\c
+.if !"\\*([G"" \& (\\*([G)\c
+.if !"\\*([I"" , \\*([I\c
+.if !"\\*([C"" , \\*([C\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([O"" \\*([O
+.RT
+.FE
+..
+.de [3 \" article in book
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \\*([A,
+.if !"\\*([T"" \\*([o\\*([T,\\*([c
+.if !"\\*([P"" pp. \\*([P
+in \\f2\\*([B\\f1\c
+.if !"\\*([E"" , ed. \\*([E\c
+.if !"\\*([I"" , \\*([I\c
+.if !"\\*([C"" , \\*([C\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([O"" \\*([O
+.RT
+.FE
+..
+.de ]<
+.[<
+..
+.de [<
+.RT
+.ne 62p
+.ie \\n(rS \{\
+. rs
+. sp 4p
+.\}
+.el .sp 27p
+.Li 2 30.5P
+\fBReferences\fP
+.br
+.if \\n(Ns<2 \{\
+. nr Ns 1
+. ds ST References
+.\}
+.\"nr Tt 7
+.sp 8p
+.rm FS FE
+.\"sy echo '.T3 "\\\\t\\\\tReferences" \\n%' >>Toc
+.ns
+..
+.de [>
+.]>
+..
+.de ]>
+.sp
+..
+.de ]-
+.[-
+..
+.de [-
+.rm [V [P [A [T
+.rm [N [C [B [O
+.rm [R [I [E [D
+..
+.de ]]
+this is never
+executed
+and just
+uses up an end-of-file
+bug.
+..
diff --git a/share/doc/papers/newvm/0.t b/share/doc/papers/newvm/0.t
new file mode 100644
index 000000000000..e23a95dfc34a
--- /dev/null
+++ b/share/doc/papers/newvm/0.t
@@ -0,0 +1,86 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)0.t 5.1 (Berkeley) 4/16/91
+.\"
+.rm CM
+.TL
+A New Virtual Memory Implementation for Berkeley
+.UX
+.AU
+Marshall Kirk McKusick
+Michael J. Karels
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+With the cost per byte of memory approaching the cost per byte
+of disk, and with file systems increasingly distant from the host
+machines, a new approach to the implementation of virtual memory is
+necessary. Rather than preallocating swap space, which limits the
+maximum virtual memory that can be supported to the size of the swap
+area, the system should support virtual memory up to the sum of the
+sizes of physical memory and swap space. For systems with a local swap
+disk, but remote file systems, it may be useful to use some of the memory
+to keep track of the contents of the swap space to avoid multiple fetches
+of the same data from the file system.
+.PP
+The new implementation should also add new functionality. Processes
+should be allowed to have large sparse address spaces, to map files
+into their address spaces, to map device memory into their address
+spaces, and to share memory with other processes. The shared address
+space may either be obtained by mapping a file into (possibly
+different) parts of their address space, or by arranging to share
+``anonymous memory'' (that is, memory that is zero fill on demand, and
+whose contents are lost when the last process unmaps the memory) with
+another process as is done in System V.
+.PP
+One use of shared memory is to provide a high-speed
+Inter-Process Communication (IPC) mechanism between two or more
+cooperating processes. To insure the integrity of data structures
+in a shared region, processes must be able to use semaphores to
+coordinate their access to these shared structures. In System V,
+these semaphores are provided as a set of system calls. Unfortunately,
+the use of system calls reduces the throughput of the shared memory
+IPC to that of existing IPC mechanisms. We are proposing a scheme
+that places the semaphores in the shared memory segment, so that
+machines that have a test-and-set instruction can handle the usual
+uncontested lock and unlock without doing a system call. Only in
+the unusual case of trying to lock an already-locked lock or in
+releasing a wanted lock will a system call be required. The
+interface will allow a user-level implementation of the System V
+semaphore interface on most machines with a much lower runtime cost.
+.AE
+.LP
+.bp
diff --git a/share/doc/papers/newvm/1.t b/share/doc/papers/newvm/1.t
new file mode 100644
index 000000000000..657fc2d420e8
--- /dev/null
+++ b/share/doc/papers/newvm/1.t
@@ -0,0 +1,377 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)1.t 5.1 (Berkeley) 4/16/91
+.\"
+.NH
+Motivations for a New Virtual Memory System
+.PP
+The virtual memory system distributed with Berkeley UNIX has served
+its design goals admirably well over the ten years of its existence.
+However, the relentless advance of technology has begun to render it
+obsolete.
+This section of the paper describes the current design,
+points out the current technological trends,
+and attempts to define the new design considerations that should
+be taken into account in a new virtual memory design.
+.SH
+Implementation of 4.3BSD virtual memory
+.PP
+All Berkeley Software Distributions through 4.3BSD
+have used the same virtual memory design.
+All processes, whether active or sleeping, have some amount of
+virtual address space associated with them.
+This virtual address space
+is the combination of the amount of address space with which they initially
+started plus any stack or heap expansions that they have made.
+All requests for address space are allocated from available swap space
+at the time that they are first made;
+if there is insufficient swap space left to honor the allocation,
+the system call requesting the address space fails synchronously.
+Thus, the limit to available virtual memory is established by the
+amount of swap space allocated to the system.
+.PP
+Memory pages are used in a sort of shell game to contain the
+contents of recently accessed locations.
+As a process first references a location
+a new page is allocated and filled either with initialized data or
+zeros (for new stack and break pages).
+As the supply of free pages begins to run out, dirty pages are
+pushed to the previously allocated swap space so that they can be reused
+to contain newly faulted pages.
+If a previously accessed page that has been pushed to swap is once
+again used, a free page is reallocated and filled from the swap area
+[Babaoglu79], [Someren84].
+.SH
+Design assumptions for 4.3BSD virtual memory
+.PP
+The design criteria for the current virtual memory implementation
+were set in 1979.
+At that time the cost of memory was about a thousand times greater per
+byte than that of magnetic disk.
+Most machines were used as centralized time sharing machines.
+These machines had far more disk storage than they had memory;
+given the cost tradeoff between memory and disk storage,
+the design sought to make maximal use of the memory, even at the cost of
+wasting some of the disk space or generating extra disk I/O.
+.PP
+The primary motivation for virtual memory was to allow the
+system to run individual programs whose address space exceeded
+the memory capacity of the machine.
+Thus the virtual memory capability allowed programs to be run that
+could not have been run on a swap based system.
+Equally important in the large central timesharing environment
+was the ability to allow the sum of the memory requirements of
+all active processes to exceed the amount of physical memory on
+the machine.
+The expected mode of operation for which the system was tuned
+was to have the sum of active virtual memory be one and a half
+to two times the physical memory on the machine.
+.PP
+At the time that the virtual memory system was designed,
+most machines ran with little or no networking.
+All the file systems were contained on disks that were
+directly connected to the machine.
+Similarly all the disk space devoted to swap space was also
+directly connected.
+Thus the speed and latency with which file systems could be accessed
+were roughly equivalent to the speed and latency with which swap
+space could be accessed.
+Given the high cost of memory there was little incentive to have
+the kernel keep track of the contents of the swap area once a process
+exited since it could almost as easily and quickly be reread from the
+file system.
+.SH
+New influences
+.PP
+In the ten years since the current virtual memory system was designed,
+many technological advances have occurred.
+One effect of the technological revolution is that the
+micro-processor has become powerful enough to allow users to have their
+own personal workstations.
+Thus the computing environment is moving away from a purely centralized
+time sharing model to an environment in which users have a
+computer on their desk.
+This workstation is linked through a network to a centralized
+pool of machines that provide filing, computing, and spooling services.
+The workstations tend to have a large quantity of memory,
+but little or no disk space.
+Because users do not want to be bothered with backing up their disks,
+and because of the difficulty of having a centralized administration
+backing up hundreds of small disks, these local disks are typically
+used only for temporary storage and as swap space.
+Long term storage is managed by the central file server.
+.PP
+Another major technical advance has been in all levels of storage capacity.
+In the last ten years we have experienced a factor of four decrease in the
+cost per byte of disk storage.
+In this same period of time the cost per byte of memory has dropped
+by a factor of a hundred!
+Thus the cost per byte of memory compared to the cost per byte of disk is
+approaching a difference of only about a factor of ten.
+The effect of this change is that the way in which a machine is used
+is beginning to change dramatically.
+As the amount of physical memory on machines increases and the number of
+users per machine decreases, the expected
+mode of operation is changing from that of supporting more active virtual
+memory than physical memory to that of having a surplus of memory that can
+be used for other purposes.
+.PP
+Because many machines will have more physical memory than they do swap
+space (with diskless workstations as an extreme example!),
+it is no longer reasonable to limit the maximum virtual memory
+to the amount of swap space as is done in the current design.
+Consequently, the new design will allow the maximum virtual memory
+to be the sum of physical memory plus swap space.
+For machines with no swap space, the maximum virtual memory will
+be governed by the amount of physical memory.
+.PP
+Another effect of the current technology is that the latency and overhead
+associated with accessing the file system is considerably higher
+since the access must be over the network
+rather than to a locally-attached disk.
+One use of the surplus memory would be to
+maintain a cache of recently used files;
+repeated uses of these files would require at most a verification from
+the file server that the data was up to date.
+Under the current design, file caching is done by the buffer pool,
+while the free memory is maintained in a separate pool.
+The new design should have only a single memory pool so that any
+free memory can be used to cache recently accessed files.
+.PP
+Another portion of the memory will be used to keep track of the contents
+of the blocks on any locally-attached swap space analogously
+to the way that memory pages are handled.
+Thus inactive swap blocks can also be used to cache less-recently-used
+file data.
+Since the swap disk is locally attached, it can be much more quickly
+accessed than a remotely located file system.
+This design allows the user to simply allocate their entire local disk
+to swap space, thus allowing the system to decide what files should
+be cached to maximize its usefulness.
+This design has two major benefits.
+It relieves the user of deciding what files
+should be kept in a small local file system.
+It also insures that all modified files are migrated back to the
+file server in a timely fashion, thus eliminating the need to dump
+the local disk or push the files manually.
+.NH
+User Interface
+.PP
+This section outlines our new virtual memory interface as it is
+currently envisioned.
+The details of the system call interface are contained in Appendix A.
+.SH
+Regions
+.PP
+The virtual memory interface is designed to support both large,
+sparse address spaces and small, densely-used address spaces.
+In this context, ``small'' is an address space roughly the
+size of the physical memory on the machine,
+while ``large'' may extend up to the maximum addressability of the machine.
+A process may divide its address space up into a number of regions.
+A process starts with four regions:
+a shared read-only fill-on-demand region with its text,
+a private fill-on-demand region for its initialized data,
+a private zero-fill-on-demand region for its uninitialized data and heap,
+and a private zero-fill-on-demand region for its stack.
+In addition to these regions, a process may allocate new ones.
+The regions may not overlap and the system may impose an alignment
+constraint, but the size of the region should not be limited
+beyond the constraints of the size of the virtual address space.
+.PP
+Each new region may be mapped either as private or shared.
+When it is privately mapped, changes to the contents of the region
+are not reflected to any other process that maps the same region.
+Regions may be mapped read-only or read-write.
+As an example, a shared library would be implemented as two regions:
+a shared read-only region for the text, and a private read-write
+region for the global variables associated with the library.
+.PP
+A region may be allocated with one of several allocation strategies.
+It may map some memory hardware on the machine such as a frame buffer.
+Since the hardware is responsible for storing the data,
+such regions must be exclusive use if they are privately mapped.
+.PP
+A region can map all or part of a file.
+As the pages are first accessed, the region is filled in with the
+appropriate part of the file.
+If the region is mapped read-write and shared, changes to the
+contents of the region are reflected back into the contents of the file.
+If the region is read-write but private,
+changes to the region are copied to a private page that is not
+visible to other processes mapping the file,
+and these modified pages are not reflected back to the file.
+.PP
+The final type of region is ``anonymous memory''.
+Uninitialed data uses such a region, privately mapped;
+it is zero-fill-on-demand and its contents are abandoned
+when the last reference is dropped.
+Unlike a region that is mapped from a file,
+the contents of an anonymous region will never be read from or
+written to a disk unless memory is short and part of the region
+must be paged to a swap area.
+If one of these regions is mapped shared,
+then all processes see the changes in the region.
+This difference has important performance considerations;
+the overhead of reading, flushing, and possibly allocating a file
+is much higher than simply zeroing memory.
+.PP
+If several processes wish to share a region,
+then they must have some way of rendezvousing.
+For a mapped file this is easy;
+the name of the file is used as the rendezvous point.
+However, processes may not need the semantics of mapped files
+nor be willing to pay the overhead associated with them.
+For anonymous memory they must use some other rendezvous point.
+Our current interface allows processes to associate a
+descriptor with a region, which they may then pass to other
+processes that wish to attach to the region.
+Such a descriptor may be bound into the UNIX file system
+name space so that other processes can find it just as
+they would with a mapped file.
+.SH
+Shared memory as high speed interprocess communication
+.PP
+The primary use envisioned for shared memory is to
+provide a high speed interprocess communication (IPC) mechanism
+between cooperating processes.
+Existing IPC mechanisms (\fIe.g.\fP pipes, sockets, or streams)
+require a system call to hand off a set
+of data destined for another process, and another system call
+by the recipient process to receive the data.
+Even if the data can be transferred by remapping the data pages
+to avoid a memory-to-memory copy, the overhead of doing the system
+calls limits the throughput of all but the largest transfers.
+Shared memory, by contrast, allows processes to share data at any
+level of granularity without system intervention.
+.PP
+However, to maintain all but the simplest of data structures,
+the processes must serialize their modifications to shared
+data structures if they are to avoid corrupting them.
+This serialization is typically done with semaphores.
+Unfortunately, most implementations of semaphores are
+done with system calls.
+Thus processes are once again limited by the need to do two
+system calls per transaction, one to lock the semaphore, the
+second to release it.
+The net effect is that the shared memory model provides little if
+any improvement in interprocess bandwidth.
+.PP
+To achieve a significant improvement in interprocess bandwidth
+requires a large decrease in the number of system calls needed to
+achieve the interaction.
+In profiling applications that use
+serialization locks such as the UNIX kernel,
+one typically finds that most locks are not contested.
+Thus if one can find a way to avoid doing a system call in the case
+in which a lock is not contested,
+one would expect to be able to dramatically reduce the number
+of system calls needed to achieve serialization.
+.PP
+In our design, cooperating processes manage their semaphores
+in their own address space.
+In the typical case, a process executes an atomic test-and-set instruction
+to acquire a lock, finds it free, and thus is able to get it.
+Only in the (rare) case where the lock is already set does the process
+need to do a system call to wait for the lock to clear.
+When a process is finished with a lock,
+it can clear the lock itself.
+Only if the ``WANT'' flag for the lock has been set is
+it necessary for the process to do a system call to cause the other
+process(es) to be awakened.
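+.PP
+As a minimal sketch,
+the \fImset\fP and \fImclear\fP primitives described in Appendix A
+could be built along these lines,
+where \fItas()\fP and \fIatomic_clear()\fP stand for assumed
+machine-dependent atomic instructions
+and \fImsleep\fP and \fImwakeup\fP are the system calls of Appendix A:
+.DS
+#define	WANT	02	/* illustrative flag: a sleeping waiter exists */
+
+mset(sem, wait)
+	semaphore *sem; int wait;
+{
+
+	while (tas(sem)) {	/* previous value non-zero: contested */
+		if (!wait)
+			return (0);	/* caller declined to wait */
+		msleep(sem);	/* kernel sets WANT, sleeps only if still set */
+	}
+	return (1);	/* lock acquired without a system call */
+}
+
+mclear(sem)
+	semaphore *sem;
+{
+
+	if (atomic_clear(sem) & WANT)	/* clear; fetch previous value */
+		mwakeup(sem);	/* system call only for a contested lock */
+}
+.DE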
+.PP
+Another issue that must be considered is portability.
+Some computers require access to special hardware to implement
+atomic interprocessor test-and-set.
+For such machines the setting and clearing of locks would
+all have to be done with system calls;
+applications could still use the same interface without change,
+though they would tend to run slowly.
+.PP
+The other issue of compatibility is with System V's semaphore
+implementation.
+Since the System V interface has been in existence for several years,
+and applications have been built that depend on this interface,
+it is important that this interface also be available.
+Although the interface is based on system calls for both setting and
+clearing locks,
+the same interface can be obtained using our interface without
+system calls in most cases.
+.PP
+This implementation can be achieved as follows.
+System V allows entire sets of semaphores to be set concurrently.
+If any of the locks are unavailable, the process is put to sleep
+until they all become available.
+Under our paradigm, a single additional semaphore is defined
+that serializes access to the set of semaphores being simulated.
+Once obtained in the usual way, the set of semaphores can be
+inspected to see if the desired ones are available.
+If they are available, they are set, the guardian semaphore
+is released and the process proceeds.
+If one or more of the requested set is not available,
+the guardian semaphore is released and the process selects one of the
+unavailable semaphores on which to wait.
+On being reawakened, the whole selection process must be repeated.
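+.PP
+A sketch of this simulation in terms of the \fImset\fP, \fImclear\fP,
+and \fImsleep\fP primitives of Appendix A follows;
+the routine name and the array convention are illustrative assumptions.
+.DS
+semset_lock(guard, sems, n)
+	semaphore *guard, **sems; int n;
+{
+	int i, blocked;
+
+	for (;;) {
+		mset(guard, 1);		/* serialize access to the set */
+		for (i = 0; i < n; i++)
+			if (mset(sems[i], 0) == 0)
+				break;	/* sems[i] is unavailable */
+		if (i == n) {		/* acquired the entire set */
+			mclear(guard);
+			return;
+		}
+		blocked = i;
+		while (--i >= 0)	/* back out the ones acquired */
+			mclear(sems[i]);
+		mclear(guard);
+		msleep(sems[blocked]);	/* wait, then repeat the selection */
+	}
+}
+.DE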
+.PP
+In all the above examples, there appears to be a race condition.
+Between the time that the process finds that a semaphore is locked
+and the time that it manages to call the system to sleep on the
+semaphore, another process may unlock the semaphore and issue a wakeup call.
+Luckily, the race can be avoided.
+The critical insight is that the process and the kernel agree
+on the physical byte of memory that is being used for the semaphore.
+The system call to put a process to sleep takes a pointer
+to the desired semaphore as its argument so that once inside
+the kernel, the kernel can repeat the test-and-set.
+If the lock has cleared
+(and possibly the wakeup issued) between the time that the process
+did the test-and-set and eventually got into the sleep request system call,
+then the kernel immediately resumes the process rather than putting
+it to sleep.
+Thus the only problem to solve is how the kernel interlocks between testing
+a semaphore and going to sleep;
+this problem has already been solved on existing systems.
+.NH
+References
+.sp
+.IP [Babaoglu79] 20
+Babaoglu, O., and Joy, W.,
+``Data Structures Added in the Berkeley Virtual Memory Extensions
+to the UNIX Operating System''
+Computer Systems Research Group, Dept of EECS, University of California,
+Berkeley, CA 94720, USA, November 1979.
+.IP [Someren84] 20
+Someren, J. van,
+``Paging in Berkeley UNIX'',
+Laboratorium voor schakeltechniek en techneik v.d.
+informatieverwerkende machines,
+Codenummer 051560-44(1984)01, February 1984.
diff --git a/share/doc/papers/newvm/Makefile b/share/doc/papers/newvm/Makefile
new file mode 100644
index 000000000000..8def3b21f34d
--- /dev/null
+++ b/share/doc/papers/newvm/Makefile
@@ -0,0 +1,10 @@
+# @(#)Makefile 1.4 (Berkeley) 6/8/93
+
+DIR= papers/newvm
+SRCS= 0.t 1.t a.t
+MACROS= -ms
+
+paper.ps: ${SRCS}
+ ${TBL} ${SRCS} | ${ROFF} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/newvm/a.t b/share/doc/papers/newvm/a.t
new file mode 100644
index 000000000000..3b6213a73594
--- /dev/null
+++ b/share/doc/papers/newvm/a.t
@@ -0,0 +1,239 @@
+.\" Copyright (c) 1986 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)a.t 5.1 (Berkeley) 4/16/91
+.\"
+.sp 2
+.ne 2i
+.NH
+Appendix A \- Virtual Memory Interface
+.SH
+Mapping pages
+.PP
+The system supports sharing of data between processes
+by allowing pages to be mapped into memory. These mapped
+pages may be \fIshared\fP with other processes or \fIprivate\fP
+to the process.
+Protection and sharing options are defined in \fI<sys/mman.h>\fP as:
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* protections are chosen from these bits, or-ed together */
+#define PROT_READ 0x04 /* pages can be read */
+#define PROT_WRITE 0x02 /* pages can be written */
+#define PROT_EXEC 0x01 /* pages can be executed */
+.DE
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* flags contain mapping type, sharing type and options */
+/* mapping type; choose one */
+#define MAP_FILE 0x0001 /* mapped from a file or device */
+#define MAP_ANON 0x0002 /* allocated from memory, swap space */
+#define MAP_TYPE 0x000f /* mask for type field */
+.DE
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* sharing types; choose one */
+#define MAP_SHARED 0x0010 /* share changes */
+#define MAP_PRIVATE 0x0000 /* changes are private */
+.DE
+.DS
+.ta \w'#define\ \ 'u +\w'MAP_HASSEMAPHORE\ \ 'u +\w'0x0080\ \ 'u
+/* other flags */
+#define MAP_FIXED 0x0020 /* map addr must be exactly as requested */
+#define MAP_INHERIT 0x0040 /* region is retained after exec */
+#define MAP_HASSEMAPHORE 0x0080 /* region may contain semaphores */
+.DE
+The cpu-dependent size of a page is returned by the
+\fIgetpagesize\fP system call:
+.DS
+pagesize = getpagesize();
+result int pagesize;
+.DE
+.LP
+The call:
+.DS
+maddr = mmap(addr, len, prot, flags, fd, pos);
+result caddr_t maddr; caddr_t addr; int *len, prot, flags, fd; off_t pos;
+.DE
+causes the pages starting at \fIaddr\fP and continuing
+for at most \fIlen\fP bytes to be mapped from the object represented by
+descriptor \fIfd\fP, starting at byte offset \fIpos\fP.
+The starting address of the region is returned;
+for the convenience of the system,
+it may differ from that supplied
+unless the MAP_FIXED flag is given,
+in which case the exact address will be used or the call will fail.
+The actual amount mapped is returned in \fIlen\fP.
+The \fIaddr\fP, \fIlen\fP, and \fIpos\fP parameters
+must all be multiples of the pagesize.
+A successful \fImmap\fP will delete any previous mapping
+in the allocated address range.
+The parameter \fIprot\fP specifies the accessibility
+of the mapped pages.
+The parameter \fIflags\fP specifies
+the type of object to be mapped,
+mapping options, and
+whether modifications made to
+this mapped copy of the page
+are to be kept \fIprivate\fP, or are to be \fIshared\fP with
+other references.
+Possible types include MAP_FILE,
+mapping a regular file or character-special device memory,
+and MAP_ANON, which maps memory not associated with any specific file.
+The file descriptor used for creating MAP_ANON regions is used only
+for naming, and may be given as \-1 if no name
+is associated with the region.\(dg
+.FS
+\(dg The current design does not allow a process
+to specify the location of swap space.
+In the future we may define an additional mapping type, MAP_SWAP,
+in which the file descriptor argument specifies a file
+or device to which swapping should be done.
+.FE
+The MAP_INHERIT flag allows a region to be inherited after an \fIexec\fP.
+The MAP_HASSEMAPHORE flag allows special handling for
+regions that may contain semaphores.
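+.PP
+As a usage sketch under this interface,
+a process might map a file for shared modification as follows;
+the file name, open mode, and omitted error handling are illustrative only.
+.DS
+int fd, len, pagesize;
+caddr_t addr;
+
+pagesize = getpagesize();
+fd = open("datafile", 2);	/* open read-write; name illustrative */
+len = 4 * pagesize;	/* a multiple of the pagesize */
+addr = mmap((caddr_t)0, &len, PROT_READ|PROT_WRITE,
+    MAP_FILE|MAP_SHARED, fd, (off_t)0);
+/* ... read and modify addr[0] through addr[len-1] ... */
+msync(addr, len);	/* described below: push changes to the file */
+munmap(addr, len);
+.DE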
+.PP
+A facility is provided to synchronize a mapped region with the file
+it maps; the call
+.DS
+msync(addr, len);
+caddr_t addr; int len;
+.DE
+writes any modified pages back to the filesystem and updates
+the file modification time.
+If \fIlen\fP is 0, all modified pages within the region containing \fIaddr\fP
+will be flushed;
+if \fIlen\fP is non-zero, only the pages containing \fIaddr\fP and the
+\fIlen\fP\-1 succeeding locations will be examined.
+Any required synchronization of memory caches
+will also take place at this time.
+Filesystem operations on a file that is mapped for shared modifications
+are unpredictable except after an \fImsync\fP.
+.PP
+A mapping can be removed by the call
+.DS
+munmap(addr, len);
+caddr_t addr; int len;
+.DE
+This call deletes the mappings for the specified address range,
+and causes further references to addresses within the range
+to generate invalid memory references.
+.SH
+Page protection control
+.PP
+A process can control the protection of pages using the call
+.DS
+mprotect(addr, len, prot);
+caddr_t addr; int len, prot;
+.DE
+This call changes the specified pages to have protection \fIprot\fP\|.
+Not all implementations will guarantee protection on a page basis;
+the granularity of protection changes may be as large as an entire region.
+.SH
+Giving and getting advice
+.PP
+A process that has knowledge of its memory behavior may
+use the \fImadvise\fP call:
+.DS
+madvise(addr, len, behav);
+caddr_t addr; int len, behav;
+.DE
+\fIBehav\fP describes expected behavior, as given
+in \fI<sys/mman.h>\fP:
+.DS
+.ta \w'#define\ \ 'u +\w'MADV_SEQUENTIAL\ \ 'u +\w'00\ \ \ \ 'u
+#define MADV_NORMAL 0 /* no further special treatment */
+#define MADV_RANDOM 1 /* expect random page references */
+#define MADV_SEQUENTIAL 2 /* expect sequential references */
+#define MADV_WILLNEED 3 /* will need these pages */
+#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_SPACEAVAIL 5 /* ensure that resources are reserved */
+.DE
+Finally, a process may obtain information about whether pages are
+core resident by using the call
+.DS
+mincore(addr, len, vec)
+caddr_t addr; int len; result char *vec;
+.DE
+Here the current core residency of the pages is returned
+in the character array \fIvec\fP, with a value of 1 meaning
+that the page is in-core.
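+.LP
+As an illustrative sketch (NPAGES is an assumed constant and
+\fIpagesize\fP comes from \fIgetpagesize\fP; neither is part of the
+interface), a process that expects to read a mapped region
+sequentially might advise the system of that fact and then check
+which pages are already resident:
+.DS
+char vec[NPAGES];
+int i;
+
+madvise(addr, NPAGES * pagesize, MADV_SEQUENTIAL);
+mincore(addr, NPAGES * pagesize, vec);
+for (i = 0; i < NPAGES; i++)
+	if (vec[i] == 1)
+		;	/* page i of the region is in-core */
+.DE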
+.SH
+Synchronization primitives
+.PP
+Primitives are provided for synchronization using semaphores in shared memory.
+Semaphores must lie within a MAP_SHARED region with at least modes
+PROT_READ and PROT_WRITE.
+The MAP_HASSEMAPHORE flag must have been specified when the region was created.
+To acquire a lock a process calls:
+.DS
+value = mset(sem, wait)
+result int value; semaphore *sem; int wait;
+.DE
+\fIMset\fP indivisibly tests and sets the semaphore \fIsem\fP.
+If the previous value is zero, the process has acquired the lock
+and \fImset\fP returns true immediately.
+Otherwise, if the \fIwait\fP flag is zero,
+failure is returned.
+If \fIwait\fP is true and the previous value is non-zero,
+\fImset\fP relinquishes the processor until notified that it should retry.
+.LP
+To release a lock a process calls:
+.DS
+mclear(sem)
+semaphore *sem;
+.DE
+\fIMclear\fP indivisibly tests and clears the semaphore \fIsem\fP.
+If the ``WANT'' flag is zero in the previous value,
+\fImclear\fP returns immediately.
+If the ``WANT'' flag is non-zero in the previous value,
+\fImclear\fP arranges for waiting processes to retry before returning.
+.PP
+Two routines provide services analogous to the kernel
+\fIsleep\fP and \fIwakeup\fP functions interpreted in the domain of
+shared memory.
+A process may relinquish the processor by calling \fImsleep\fP
+with a set semaphore:
+.DS
+msleep(sem)
+semaphore *sem;
+.DE
+If the semaphore is still set when it is checked by the kernel,
+the process will be put in a sleeping state
+until some other process issues an \fImwakeup\fP for the same semaphore
+within the region using the call:
+.DS
+mwakeup(sem)
+semaphore *sem;
+.DE
+An \fImwakeup\fP may awaken all sleepers on the semaphore,
+or may awaken only the next sleeper on a queue.
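+.LP
+Taken together, these primitives support a simple lock in shared
+memory.
+In the sketch below, \fIsem\fP is assumed to point into a region
+mapped with MAP_SHARED and MAP_HASSEMAPHORE, and \fImset\fP with a
+non-zero \fIwait\fP flag is assumed to return only once the lock has
+been acquired:
+.DS
+semaphore *sem;
+
+(void) mset(sem, 1);	/* acquire the lock, sleeping if necessary */
+/* ... operate on the shared data structure ... */
+mclear(sem);		/* release; any waiting processes will retry */
+.DE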
diff --git a/share/doc/papers/newvm/spell.ok b/share/doc/papers/newvm/spell.ok
new file mode 100644
index 000000000000..543dc7e16a8f
--- /dev/null
+++ b/share/doc/papers/newvm/spell.ok
@@ -0,0 +1,56 @@
+ANON
+Babaoglu
+Babaoglu79
+Behav
+CM
+Codenummer
+DONTNEED
+Dept
+EECS
+Filesystem
+HASSEMAPHORE
+IPC
+Karels
+Laboratorium
+MADV
+McKusick
+Mclear
+Mset
+NOEXTEND
+PROT
+SPACEAVAIL
+Someren
+Someren84
+WILLNEED
+addr
+behav
+caching
+caddr
+es
+fd
+filesystem
+getpagesize
+informatieverwerkende
+len
+maddr
+madvise
+mclear
+mincore
+mman.h
+mmap
+mprotect
+mset
+msleep
+msync
+munmap
+mwakeup
+pagesize
+pos
+prot
+runtime
+schakeltechniek
+sem
+techneik
+v.d
+vec
+voor
diff --git a/share/doc/papers/nqnfs/Makefile b/share/doc/papers/nqnfs/Makefile
new file mode 100644
index 000000000000..37530faa77f6
--- /dev/null
+++ b/share/doc/papers/nqnfs/Makefile
@@ -0,0 +1,10 @@
+# @(#)Makefile 8.1 (Berkeley) 4/20/94
+
+DIR= papers/nqnfs
+SRCS= nqnfs.me
+MACROS= -me
+
+paper.ps: ${SRCS}
+ ${PIC} ${SRCS} | ${TBL} | ${ROFF} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/nqnfs/nqnfs.me b/share/doc/papers/nqnfs/nqnfs.me
new file mode 100644
index 000000000000..ce9003efb2c1
--- /dev/null
+++ b/share/doc/papers/nqnfs/nqnfs.me
@@ -0,0 +1,2007 @@
+.\" Copyright (c) 1993 The Usenix Association. All rights reserved.
+.\"
+.\" This document is derived from software contributed to Berkeley by
+.\" Rick Macklem at The University of Guelph with the permission of
+.\" the Usenix Association.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)nqnfs.me 8.1 (Berkeley) 4/20/94
+.\"
+.lp
+.nr PS 12
+.ps 12
+Reprinted with permission from the "Proceedings of the Winter 1994 Usenix
+Conference", January 1994, San Francisco, CA, Copyright The Usenix
+Association.
+.nr PS 14
+.ps 14
+.sp
+.ce
+\fBNot Quite NFS, Soft Cache Consistency for NFS\fR
+.nr PS 12
+.ps 12
+.sp
+.ce
+\fIRick Macklem\fR
+.ce
+\fIUniversity of Guelph\fR
+.sp
+.nr PS 12
+.ps 12
+.ce
+\fBAbstract\fR
+.nr PS 10
+.ps 10
+.pp
+There are some constraints inherent in the NFS\(tm\(mo protocol
+that result in performance limitations
+for high performance
+workstation environments.
+This paper discusses an NFS-like protocol named Not Quite NFS (NQNFS),
+designed to address some of these limitations.
+This protocol provides full cache consistency during normal
+operation, while permitting more effective client-side caching in an
+effort to improve performance.
+There are also a variety of minor protocol changes, in order to resolve
+various NFS issues.
+The emphasis is on observed performance of a
+preliminary implementation of the protocol, in order to show
+how well this design works
+and to suggest possible areas for further improvement.
+.sh 1 "Introduction"
+.pp
+It has been observed that
+overall workstation performance has not been scaling with
+processor speed and that file system I/O is a limiting factor [Ousterhout90].
+Ousterhout
+notes
+that a principal challenge for operating system developers is the
+decoupling of system calls from their underlying I/O operations, in order
+to improve average system call response times.
+For distributed file systems, every synchronous Remote Procedure Call (RPC)
+takes a minimum of a few milliseconds and, as such, is analogous to an
+underlying I/O operation.
+This suggests that client caching with a very good
+hit ratio for read type operations, along with asynchronous writing, is required in order to avoid delays waiting for RPC replies.
+However, the NFS protocol requires that the server be stateless\**
+.(f
+\**To function correctly, the server must not require any state
+that may be lost due to a crash.
+.)f
+and does not provide any explicit mechanism for client cache
+consistency, putting
+constraints on how the client may cache data.
+This paper describes an NFS-like protocol that includes a cache consistency
+component designed to enhance client caching performance. It does provide
+full consistency under normal operation, but without requiring that hard
+state information be maintained on the server.
+Design tradeoffs were made towards simplicity and
+high performance over cache consistency under abnormal conditions.
+The protocol design uses a variation of Leases [Gray89]
+to provide state on the server that does not need to be recovered after a
+crash.
+.pp
+The protocol also includes changes designed to address other limitations
+of NFS in a modern workstation environment.
+The use of TCP transport is optionally available to avoid
+the pitfalls of Sun RPC over UDP transport when running across an internetwork [Nowicki89].
+Kerberos [Steiner88] support is available
+to do proper user authentication, in order to provide improved security and
+arbitrary client to server user ID mappings.
+There are also a variety of other changes to accommodate large file systems,
+such as 64-bit file sizes and offsets, as well as lifting the 8Kbyte I/O size
+limit.
+The remainder of this paper gives an overview of the protocol, highlighting
+performance related components, followed by an evaluation of resultant performance
+for the 4.4BSD implementation.
+.sh 1 "Distributed File Systems and Caching"
+.pp
+Clients using distributed file systems cache recently-used data in order
+to reduce the number of synchronous server operations, and therefore improve
+average response times for system calls.
+Unfortunately, maintaining consistency between these caches is a problem
+whenever write sharing occurs; that is, when a process on a client writes
+to a file and one or more processes on other client(s) read the file.
+If the writer closes the file before any reader(s) open the file for reading,
+this is called sequential write sharing. Both the Andrew ITC file system
+[Howard88] and NFS [Sandberg85] maintain consistency for sequential write
+sharing by requiring the writer to push all the writes through to the
+server on close and having readers check to see if the file has been
+modified upon open. If the file has been modified, the client throws away
+all cached data for that file, as it is now stale.
+NFS implementations typically detect file modification by checking a cached
+copy of the file's modification time; since this cached value is often
+several seconds out of date and only has a resolution of one second, an NFS
+client often uses stale cached data for some time after the file has
+been updated on the server.
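+.pp
+In outline, the open-time check done by a typical NFS client looks
+something like the following sketch, where the node and field names
+are hypothetical rather than those of any particular implementation:
+.sp
+.nf
+	/* on open, compare the server's modification time with the cache */
+	if (attr.mtime != np->n_mtime) {
+		invalidate_cached_data(np);	/* cached data is stale */
+		np->n_mtime = attr.mtime;
+	}
+.fi
+.sp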
+.pp
+A more difficult case is concurrent write sharing, where write operations are intermixed
+with read operations.
+Consistency for this case, often referred to as "full cache consistency,"
+requires that a reader always receives the most recently written data.
+Neither NFS nor the Andrew ITC file system maintain consistency for this
+case.
+The simplest mechanism for maintaining full cache consistency is the one
+used by Sprite [Nelson88], which disables all client caching of the
+file whenever concurrent write sharing might occur.
+There are other mechanisms described in the literature [Kent87a,
+Burrows88], but they appeared to be too elaborate for incorporation
+into NQNFS (for example, Kent's requires specialized hardware).
+NQNFS differs from Sprite in the way it
+detects write sharing. The Sprite server maintains a list of files currently open
+by the various clients and detects write sharing when a file open request
+for writing is received and the file is already open for reading
+(or vice versa).
+This list of open files is hard state information that must be recovered
+after a server crash, which is a significant problem in its own
+right [Mogul93, Welch90].
+.pp
+The approach used by NQNFS is a variant of the Leases mechanism [Gray89].
+In this model, the server issues to a client a promise, referred to as a
+"lease," that the client may cache a specific object without fear of
+conflict.
+A lease has a limited duration and must be renewed by the client if it
+wishes to continue to cache the object.
+In NQNFS, clients hold short-term (up to one minute) leases on files
+for reading or writing.
+The leases are analogous to entries in the open file list, except that
+they expire after the lease term unless renewed by the client.
+As such, one minute after issuing the last lease there are no current
+leases and therefore no lease records to be recovered after a crash, hence
+the term "soft server state."
+.pp
+A related design consideration is the way client writing is done.
+Synchronous writing requires that all writes be pushed through to the server
+during the write system call.
+This is the simplest variant, from a consistency point of view, since the
+server always has the most recently written data. It also permits any write
+errors, such as "file system out of space" to be propagated back to the
+client's process via the write system call return.
+Unfortunately this approach limits the client write rate, based on server write
+performance and client/server RPC round trip time (RTT).
+.pp
+An alternative to this is delayed writing, where the write system call returns
+as soon as the data is cached on the client and the data is written to the
+server sometime later.
+This permits client writing to occur at the rate of local storage access
+up to the size of the local cache.
+Also, for cases where file truncation/deletion occurs shortly after writing,
+the write to the server may be avoided since the data has already been
+deleted, reducing server write load.
+There are some obvious drawbacks to this approach.
+For any Sprite-like system to maintain
+full consistency, the server must "callback" to the client to cause the
+delayed writes to be written back to the server when write sharing is about to
+occur.
+There are also problems with the propagation of errors
+back to the client process that issued the write system call.
+The reason for this is that
+the system call has already returned without reporting an error and the
+process may also have already terminated.
+As well, there is a risk of the loss of recently written data if the client
+crashes before the data is written back to the server.
+.pp
+A compromise between these two alternatives is asynchronous writing, where
+the write to the server is initiated during the write system call but the write system
+call returns before the write completes.
+This approach minimizes the risk of data loss due to a client crash, but negates
+the possibility of reducing server write load by throwing writes away when
+a file is truncated or deleted.
+.pp
+NFS implementations usually do a mix of asynchronous and delayed writing
+but push all writes to the server upon close, in order to maintain open/close
+consistency.
+Pushing the delayed writes on close
+negates much of the performance advantage of delayed writing, since the
+delays that were avoided in the write system calls are observed in the close
+system call.
+Akin to Sprite, the NQNFS protocol does delayed writing in an effort to achieve
+good client performance and uses a callback mechanism to maintain full cache
+consistency.
+.sh 1 "Related Work"
+.pp
+There has been a great deal of effort put into improving the performance and
+consistency of the NFS protocol. This work can be put in two categories.
+The first category consists of implementation enhancements for the NFS protocol
+and the second involves modifications to the protocol.
+.pp
+The work done on implementation enhancements has attacked two problem areas:
+NFS server write performance and RPC transport problems.
+Server write performance is a major problem for NFS, in part due to the
+requirement to push all writes to the server upon close and in part due
+to the fact that, for writes, all data and meta-data must be committed to
+non-volatile storage before the server replies to the write RPC.
+The Prestoserve\(tm\(dg
+[Moran90]
+system uses non-volatile RAM as a buffer for recently written data on the server,
+so that the write RPC replies can be returned to the client before the data is written to the
+disk surface.
+Write gathering [Juszczak94] is a software technique used on the server where a write
+RPC request is delayed for a short time in the hope that another contiguous
+write request will arrive, so that they can be merged into one write operation.
+Since the replies to all of the merged writes are not returned to the client until the write
+operation is completed, this delay does not violate the protocol.
+When write operations are merged, the number of disk writes can be reduced,
+improving server write performance.
+Although either of the above reduces write RPC response time for the server,
+it cannot be reduced to zero, and so, any client side caching mechanism
+that reduces write RPC load or client dependence on server RPC response time
+should still improve overall performance.
+Good client side caching should be complementary to these server techniques,
+although client performance improvements as a result of caching may be less
+dramatic when these techniques are used.
+.pp
+In NFS, each Sun RPC request is packaged in a UDP datagram for transmission
+to the server. A timer is started, and if a timeout occurs before the corresponding
+RPC reply is received, the RPC request is retransmitted.
+There are two problems with this model.
+First, when a retransmit timeout occurs, the RPC may be redone, instead of
+simply retransmitting the RPC request message to the server. A recent-request
+cache can be used on the server to minimize the negative impact of redoing
+RPCs [Juszczak89].
+The second problem is that a large UDP datagram, such as a read request or
+write reply, must be fragmented by IP and if any one IP fragment is lost in
+transit, the entire UDP datagram is lost [Kent87]. Since entire requests and replies
+are packaged in a single UDP datagram, this puts an upper bound on the read/write
+data size (8 kbytes).
+.pp
+Adjusting the retransmit timeout (RTT) interval dynamically and applying a
+congestion window on outstanding requests has been shown to be of some help
+[Nowicki89] with the retransmission problem.
+An alternative to this is to use TCP transport to deliver the RPC messages
+reliably [Macklem90]; one of the performance results in this paper
+shows the effect of doing so.
+.pp
+Srinivasan and Mogul [Srinivasan89] enhanced the NFS protocol to use the Sprite cache
+consistency algorithm in an effort to improve performance and to provide
+full client cache consistency.
+This experimental implementation demonstrated significantly better
+performance than NFS, but suffered from a lack of crash recovery support.
+The NQNFS protocol design borrowed heavily from this work, but differed
+from the Sprite algorithm by using Leases instead of file open state
+to detect write sharing.
+The decision to use Leases was made primarily to avoid the crash recovery
+problem.
+More recent work by the Sprite group [Baker91] and Mogul [Mogul93] has
+addressed the crash recovery problem, making this design tradeoff more
+questionable now.
+.pp
+Sun has recently updated the NFS protocol to Version 3 [SUN93], using some
+changes similar to NQNFS to address various issues. The Version 3 protocol
+uses 64-bit file sizes and offsets, provides a Readdir_and_Lookup RPC and
+an access RPC.
+It also provides cache hints, to permit a client to be able to determine
+whether a file modification is the result of that client's write or some
+other client's write.
+It would be possible to add either Spritely NFS or NQNFS support for cache
+consistency to the NFS Version 3 protocol.
+.sh 1 "NQNFS Consistency Protocol and Recovery"
+.pp
+The NQNFS cache consistency protocol uses a somewhat Sprite-like [Nelson88]
+mechanism, but is based on Leases [Gray89] instead of hard server state information
+about open files.
+The basic principle is that the server disables client caching of files whenever
+concurrent write sharing could occur, by performing a server-to-client
+callback,
+forcing the client to flush its caches and to do all subsequent I/O on the file with
+synchronous RPCs.
+A Sprite server maintains a record of the open state of files for
+all clients and uses this to determine when concurrent write sharing might
+occur.
+This \fIopen state\fR information might also be referred to as an infinite-term
+lease for the file, with explicit lease cancellation.
+NQNFS, on the other hand, uses a short-term lease that expires due to timeout
+after a maximum of one minute, unless explicitly renewed by the client.
+The fundamental difference is that an NQNFS client must keep renewing
+a lease to use cached data whereas a Sprite client assumes the data is valid until canceled
+by the server
+or the file is closed.
+Using leases permits the server to remain "stateless," since the soft
+state information, which consists of the set of current leases, is
+moot after one minute, when all the leases expire.
+.pp
+Whenever a client wishes to access a file's data it must hold one of
+three types of lease: read-caching, write-caching or non-caching.
+The latter type requires that all file operations be done synchronously with
+the server via the appropriate RPCs.
+.pp
+A read-caching lease allows for client data caching but no modifications
+may be done.
+It may, however, be shared between multiple clients. Diagram 1 shows a typical
+read-caching scenario. The vertical solid black lines depict the lease records.
+Note that the time lines are not drawn to scale, since a client/server
+interaction will normally take less than one hundred milliseconds, whereas the
+normal lease duration is thirty seconds.
+Every lease includes a \fImodrev\fR value, which changes upon every modification
+of the file. It may be used to check to see if data cached on the client is
+still current.
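+.pp
+Although the protocol does not dictate a representation, a server
+lease record of the sort discussed here might carry fields along the
+following lines (a hypothetical sketch; the names are not taken from
+the implementation):
+.sp
+.nf
+	struct lease {
+		fhandle_t  lc_fh;	/* file the lease covers */
+		int        lc_type;	/* read-caching, write-caching
+					   or non-caching */
+		time_t     lc_expiry;	/* end of the lease term */
+		u_quad_t   lc_modrev;	/* changes on each modification */
+		/* ... plus the set of client hosts holding the lease */
+	};
+.fi
+.sp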
+.pp
+A write-caching lease permits delayed write caching,
+but requires that all data be pushed to the server when the lease expires
+or is terminated by an eviction callback.
+When a write-caching lease has almost expired, the client will attempt to
+extend the lease if the file is still open, but is required to push the delayed writes to the server
+if renewal fails (as depicted by diagram 2).
+The writes may not arrive at the server until after the write lease has
+expired on the client, but this does not result in a consistency problem,
+so long as the write lease is still valid on the server.
+Note that, in diagram 2, the lease record on the server remains current after
+the expiry time, due to the conditions mentioned in section 5.
+If a write RPC is done on the server after the write lease has expired on
+the server, this could be considered an error since consistency could be
+lost, but it is not handled as such by NQNFS.
+.pp
+Diagram 3 depicts how read and write leases are replaced by a non-caching
+lease when there is the potential for write sharing.
+.(z
+.sp
+.PS
+.ps
+.ps 50
+line from 0.738,5.388 to 1.238,5.388
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 1.488,10.075 to 1.488,5.450
+line dashed from 2.987,10.075 to 2.987,5.450
+line dashed from 4.487,10.075 to 4.487,5.450
+.ps
+.ps 50
+line from 4.487,7.013 to 4.487,5.950
+line from 2.987,7.700 to 2.987,5.950 to 2.987,6.075
+line from 1.488,7.513 to 1.488,5.950
+line from 2.987,9.700 to 2.987,8.325
+line from 1.488,9.450 to 1.488,8.325
+.ps
+.ps 10
+line from 2.987,6.450 to 4.487,6.200
+line from 4.385,6.192 to 4.487,6.200 to 4.393,6.241
+line from 4.487,6.888 to 2.987,6.575
+line from 3.080,6.620 to 2.987,6.575 to 3.090,6.571
+line from 2.987,7.263 to 4.487,7.013
+line from 4.385,7.004 to 4.487,7.013 to 4.393,7.054
+line from 4.487,7.638 to 2.987,7.388
+line from 3.082,7.429 to 2.987,7.388 to 3.090,7.379
+line from 2.987,6.888 to 1.488,6.575
+line from 1.580,6.620 to 1.488,6.575 to 1.590,6.571
+line from 1.488,7.200 to 2.987,6.950
+line from 2.885,6.942 to 2.987,6.950 to 2.893,6.991
+line from 2.987,7.700 to 1.488,7.513
+line from 1.584,7.550 to 1.488,7.513 to 1.590,7.500
+line from 1.488,8.012 to 2.987,7.763
+line from 2.885,7.754 to 2.987,7.763 to 2.893,7.804
+line from 2.987,9.012 to 1.488,8.825
+line from 1.584,8.862 to 1.488,8.825 to 1.590,8.813
+line from 1.488,9.325 to 2.987,9.137
+line from 2.885,9.125 to 2.987,9.137 to 2.891,9.175
+line from 2.987,9.637 to 1.488,9.450
+line from 1.584,9.487 to 1.488,9.450 to 1.590,9.438
+line from 1.488,9.887 to 2.987,9.700
+line from 2.885,9.688 to 2.987,9.700 to 2.891,9.737
+.ps
+.ps 12
+.ft
+.ft R
+"Lease valid on machine" at 1.363,5.296 ljust
+"with same modrev" at 1.675,7.421 ljust
+"miss)" at 2.612,9.233 ljust
+"(cache" at 2.300,9.358 ljust
+.ps
+.ps 14
+"Diagram #1: Read Caching Leases" at 0.738,5.114 ljust
+"Client B" at 4.112,10.176 ljust
+"Server" at 2.612,10.176 ljust
+"Client A" at 0.925,10.176 ljust
+.ps
+.ps 12
+"from cache" at 4.675,6.546 ljust
+"Read syscalls" at 4.675,6.796 ljust
+"Reply" at 3.737,6.108 ljust
+"(cache miss)" at 3.675,6.421 ljust
+"Read req" at 3.737,6.608 ljust
+"to lease" at 3.112,6.796 ljust
+"Client B added" at 3.112,6.983 ljust
+"Reply" at 3.237,7.296 ljust
+"Read + lease req" at 3.175,7.671 ljust
+"Read syscall" at 4.675,7.608 ljust
+"Reply" at 1.675,6.796 ljust
+"miss)" at 2.487,7.108 ljust
+"Read req (cache" at 1.675,7.233 ljust
+"from cache" at 0.425,6.296 ljust
+"Read syscalls" at 0.425,6.546 ljust
+"cache" at 0.425,6.858 ljust
+"so can still" at 0.425,7.108 ljust
+"Modrev same" at 0.425,7.358 ljust
+"Reply" at 1.675,7.671 ljust
+"Get lease req" at 1.675,8.108 ljust
+"Read syscall" at 0.425,7.983 ljust
+"Lease times out" at 0.425,8.296 ljust
+"from cache" at 0.425,9.046 ljust
+"Read syscalls" at 0.425,9.296 ljust
+"for Client A" at 3.112,9.296 ljust
+"Read caching lease" at 3.112,9.483 ljust
+"Reply" at 1.675,8.983 ljust
+"Read req" at 1.675,9.358 ljust
+"Reply" at 1.675,9.608 ljust
+"Read + lease req" at 1.675,9.921 ljust
+"Read syscall" at 0.425,9.921 ljust
+.ps
+.ft
+.PE
+.sp
+.)z
+.(z
+.sp
+.PS
+.ps
+.ps 50
+line from 1.175,5.700 to 1.300,5.700
+line from 0.738,5.700 to 1.175,5.700
+line from 2.987,6.638 to 2.987,6.075
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 2.987,6.575 to 2.987,5.950
+line dashed from 1.488,6.575 to 1.488,5.888
+.ps
+.ps 50
+line from 2.987,9.762 to 2.987,6.638
+line from 1.488,9.450 to 1.488,7.700
+.ps
+.ps 10
+line from 2.987,6.763 to 1.488,6.575
+line from 1.584,6.612 to 1.488,6.575 to 1.590,6.563
+line from 1.488,7.013 to 2.987,6.825
+line from 2.885,6.813 to 2.987,6.825 to 2.891,6.862
+line from 2.987,7.325 to 1.488,7.075
+line from 1.582,7.116 to 1.488,7.075 to 1.590,7.067
+line from 1.488,7.700 to 2.987,7.388
+line from 2.885,7.383 to 2.987,7.388 to 2.895,7.432
+line from 2.987,8.575 to 1.488,8.325
+line from 1.582,8.366 to 1.488,8.325 to 1.590,8.317
+line from 1.488,8.887 to 2.987,8.637
+line from 2.885,8.629 to 2.987,8.637 to 2.893,8.679
+line from 2.987,9.637 to 1.488,9.450
+line from 1.584,9.487 to 1.488,9.450 to 1.590,9.438
+line from 1.488,9.887 to 2.987,9.762
+line from 2.886,9.746 to 2.987,9.762 to 2.890,9.796
+line dashed from 2.987,10.012 to 2.987,6.513
+line dashed from 1.488,10.012 to 1.488,6.513
+.ps
+.ps 12
+.ft
+.ft R
+"write" at 4.237,5.921 ljust
+"Lease valid on machine" at 1.425,5.733 ljust
+.ps
+.ps 14
+"Diagram #2: Write Caching Lease" at 0.738,5.551 ljust
+"Server" at 2.675,10.114 ljust
+"Client A" at 1.113,10.114 ljust
+.ps
+.ps 12
+"seconds after last" at 3.112,5.921 ljust
+"Expires write_slack" at 3.112,6.108 ljust
+"due to write activity" at 3.112,6.608 ljust
+"Expiry delayed" at 3.112,6.796 ljust
+"Lease times out" at 3.112,7.233 ljust
+"Lease renewed" at 3.175,8.546 ljust
+"Lease for client A" at 3.175,9.358 ljust
+"Write caching" at 3.175,9.608 ljust
+"Reply" at 1.675,6.733 ljust
+"Write req" at 1.988,7.046 ljust
+"Reply" at 1.675,7.233 ljust
+"Write req" at 1.675,7.796 ljust
+"Lease expires" at 0.487,7.733 ljust
+"Close syscall" at 0.487,8.108 ljust
+"lease granted" at 1.675,8.546 ljust
+"Get write lease" at 1.675,8.921 ljust
+"before expiry" at 0.487,8.608 ljust
+"Lease renewal" at 0.487,8.796 ljust
+"syscalls" at 0.487,9.046 ljust
+"Delayed write" at 0.487,9.233 ljust
+"lease granted" at 1.675,9.608 ljust
+"Get write lease req" at 1.675,9.921 ljust
+"Write syscall" at 0.487,9.858 ljust
+.ps
+.ft
+.PE
+.sp
+.)z
+.(z
+.sp
+.PS
+.ps
+.ps 50
+line from 0.613,2.638 to 1.238,2.638
+line from 1.488,4.075 to 1.488,3.638
+line from 2.987,4.013 to 2.987,3.575
+line from 4.487,4.013 to 4.487,3.575
+.ps
+.ps 10
+line from 2.987,3.888 to 4.487,3.700
+line from 4.385,3.688 to 4.487,3.700 to 4.391,3.737
+line from 4.487,4.138 to 2.987,3.950
+line from 3.084,3.987 to 2.987,3.950 to 3.090,3.938
+line from 2.987,4.763 to 4.487,4.450
+line from 4.385,4.446 to 4.487,4.450 to 4.395,4.495
+.ps
+.ps 50
+line from 4.487,4.438 to 4.487,4.013
+.ps
+.ps 10
+line from 4.487,5.138 to 2.987,4.888
+line from 3.082,4.929 to 2.987,4.888 to 3.090,4.879
+.ps
+.ps 50
+line from 4.487,6.513 to 4.487,5.513
+line from 4.487,6.513 to 4.487,6.513 to 4.487,5.513
+line from 2.987,5.450 to 2.987,5.200
+line from 1.488,5.075 to 1.488,4.075
+line from 2.987,5.263 to 2.987,4.013
+line from 2.987,7.700 to 2.987,5.325
+line from 4.487,7.575 to 4.487,6.513
+line from 1.488,8.512 to 1.488,8.075
+line from 2.987,8.637 to 2.987,8.075
+line from 2.987,9.637 to 2.987,8.825
+line from 1.488,9.450 to 1.488,8.950
+.ps
+.ps 10
+line from 2.987,4.450 to 1.488,4.263
+line from 1.584,4.300 to 1.488,4.263 to 1.590,4.250
+line from 1.488,4.888 to 2.987,4.575
+line from 2.885,4.571 to 2.987,4.575 to 2.895,4.620
+line from 2.987,5.263 to 1.488,5.075
+line from 1.584,5.112 to 1.488,5.075 to 1.590,5.063
+line from 4.487,5.513 to 2.987,5.325
+line from 3.084,5.362 to 2.987,5.325 to 3.090,5.313
+line from 2.987,5.700 to 4.487,5.575
+line from 4.386,5.558 to 4.487,5.575 to 4.390,5.608
+line from 4.487,6.013 to 2.987,5.825
+line from 3.084,5.862 to 2.987,5.825 to 3.090,5.813
+line from 2.987,6.200 to 4.487,6.075
+line from 4.386,6.058 to 4.487,6.075 to 4.390,6.108
+line from 4.487,6.450 to 2.987,6.263
+line from 3.084,6.300 to 2.987,6.263 to 3.090,6.250
+line from 2.987,6.700 to 4.487,6.513
+line from 4.385,6.500 to 4.487,6.513 to 4.391,6.550
+line from 1.488,6.950 to 2.987,6.763
+line from 2.885,6.750 to 2.987,6.763 to 2.891,6.800
+line from 2.987,7.700 to 4.487,7.575
+line from 4.386,7.558 to 4.487,7.575 to 4.390,7.608
+line from 4.487,7.950 to 2.987,7.763
+line from 3.084,7.800 to 2.987,7.763 to 3.090,7.750
+line from 2.987,8.637 to 1.488,8.512
+line from 1.585,8.546 to 1.488,8.512 to 1.589,8.496
+line from 1.488,8.887 to 2.987,8.700
+line from 2.885,8.688 to 2.987,8.700 to 2.891,8.737
+line from 2.987,9.637 to 1.488,9.450
+line from 1.584,9.487 to 1.488,9.450 to 1.590,9.438
+line from 1.488,9.950 to 2.987,9.762
+line from 2.885,9.750 to 2.987,9.762 to 2.891,9.800
+dashwid = 0.050i
+line dashed from 4.487,10.137 to 4.487,2.825
+line dashed from 2.987,10.137 to 2.987,2.825
+line dashed from 1.488,10.137 to 1.488,2.825
+.ps
+.ps 12
+.ft
+.ft R
+"(not cached)" at 4.612,3.858 ljust
+.ps
+.ps 14
+"Diagram #3: Write sharing case" at 0.613,2.239 ljust
+.ps
+.ps 12
+"Write syscall" at 4.675,7.546 ljust
+"Read syscall" at 0.550,9.921 ljust
+.ps
+.ps 14
+"Lease valid on machine" at 1.363,2.551 ljust
+.ps
+.ps 12
+"(can still cache)" at 1.675,8.171 ljust
+"Reply" at 3.800,3.858 ljust
+"Write" at 3.175,4.046 ljust
+"writes" at 4.612,4.046 ljust
+"synchronous" at 4.612,4.233 ljust
+"write syscall" at 4.675,5.108 ljust
+"non-caching lease" at 3.175,4.296 ljust
+"Reply " at 3.175,4.483 ljust
+"req" at 3.175,4.983 ljust
+"Get write lease" at 3.175,5.108 ljust
+"Vacated msg" at 3.175,5.483 ljust
+"to the server" at 4.675,5.858 ljust
+"being flushed to" at 4.675,6.046 ljust
+"Delayed writes" at 4.675,6.233 ljust
+.ps
+.ps 16
+"Server" at 2.675,10.182 ljust
+"Client B" at 3.925,10.182 ljust
+"Client A" at 0.863,10.182 ljust
+.ps
+.ps 12
+"(not cached)" at 0.550,4.733 ljust
+"Read data" at 0.550,4.921 ljust
+"Reply data" at 1.675,4.421 ljust
+"Read request" at 1.675,4.921 ljust
+"lease" at 1.675,5.233 ljust
+"Reply non-caching" at 1.675,5.421 ljust
+"Reply" at 3.737,5.733 ljust
+"Write" at 3.175,5.983 ljust
+"Reply" at 3.737,6.171 ljust
+"Write" at 3.175,6.421 ljust
+"Eviction Notice" at 3.175,6.796 ljust
+"Get read lease" at 1.675,7.046 ljust
+"Read syscall" at 0.550,6.983 ljust
+"being cached" at 4.675,7.171 ljust
+"Delayed writes" at 4.675,7.358 ljust
+"lease" at 3.175,7.233 ljust
+"Reply write caching" at 3.175,7.421 ljust
+"Get write lease" at 3.175,7.983 ljust
+"Write syscall" at 4.675,7.983 ljust
+"with same modrev" at 1.675,8.358 ljust
+"Lease" at 0.550,8.171 ljust
+"Renewed" at 0.550,8.358 ljust
+"Reply" at 1.675,8.608 ljust
+"Get Lease Request" at 1.675,8.983 ljust
+"Read syscall" at 0.550,8.733 ljust
+"from cache" at 0.550,9.108 ljust
+"Read syscall" at 0.550,9.296 ljust
+"Reply " at 1.675,9.671 ljust
+"plus lease" at 2.050,9.983 ljust
+"Read Request" at 1.675,10.108 ljust
+.ps
+.ft
+.PE
+.sp
+.)z
+A write-caching lease is not used in the Stanford V Distributed System [Gray89],
+since synchronous writing is always used. A side effect of this change
+is that the five to ten second lease duration recommended by Gray was found
+to be insufficient to achieve good performance for the write-caching lease.
+Experimentation showed that thirty seconds was about optimal for cases where
+the client and server are connected to the same local area network, so
+thirty seconds is the default lease duration for NQNFS.
+A maximum of twice that value is permitted, since Gray showed that for some
+network topologies, a larger lease duration functions better.
+Although there is an explicit get_lease RPC defined for the protocol,
+most lease requests are piggybacked onto the other RPCs to minimize the
+additional overhead introduced by leasing.
+.sh 2 "Rationale"
+.pp
+Leasing was chosen over hard server state information for the following
+reasons:
+.ip 1.
+The server must maintain state information about all current
+client leases.
+Since at most one lease is allocated for each RPC and the leases expire
+after their lease term,
+the upper bound on the number of current leases is the product of the
+lease term and the server RPC rate.
+In practice, it has been observed that less than 10% of RPCs request new leases
+and since most leases have a term of thirty seconds, the following rule of
+thumb should estimate the number of server lease records:
+.sp
+.nf
+ Number of Server Lease Records \(eq 0.1 * 30 * RPC rate
+.fi
+.sp
+Since each lease record occupies 64 bytes of server memory, storing the lease
+records should not be a serious problem; for example, a server fielding
+100 RPCs per second would hold on the order of 0.1 * 30 * 100 = 300
+records, or about 19 Kbytes.
+If a server has exhausted lease storage, it can simply wait a few seconds
+for a lease to expire and free up a record.
+On the other hand, a Sprite-like server must store records for all files
+currently open by all clients, which can require significant storage for
+a large, heavily loaded server.
+In [Mogul93], it is proposed that a mechanism vaguely similar to paging could be
+used to deal with this for Spritely NFS, but this
+appears to introduce a fair amount of complexity and may limit the
+usefulness of open records for storing other state information, such
+as file locks.
+.ip 2.
+After a server crashes, it must recover lease records for
+the current outstanding leases, which actually implies that if it waits
+until all leases have expired, there is no state to recover.
+The server must wait for the maximum lease duration of one minute, and it must serve
+all outstanding write requests resulting from terminated write-caching
+leases before issuing new leases. The one minute delay can be overlapped with
+file system consistency checking (e.g., fsck).
+Because no state must be recovered, a lease-based server, like an NFS server,
+avoids the problem of state recovery after a crash.
+.sp
+There can, however, be problems during crash recovery
+because of a potentially large number of write backs due to terminated
+write-caching leases.
+One of these problems is a "recovery storm" [Baker91], which could occur when
+the server is overloaded by the number of write RPC requests.
+The NQNFS protocol deals with this by replying
+with a return status code called
+try_again_later to all
+RPC requests (except write) until the write requests subside.
+At this time, there has not been sufficient testing of server crash
+recovery while under heavy server load to determine if the try_again_later
+reply is a sufficient solution to the problem.
+The other problem is that consistency will be lost if other RPCs are performed
+before all of the write backs for terminated write-caching leases have completed.
+This is handled by having the server perform only write RPCs until
+no write RPC requests arrive
+for write_slack seconds, where write_slack is set to several times
+the client timeout retransmit interval,
+at which time it is assumed all clients have had an opportunity to send their writes
+to the server.
+.ip 3.
+Another advantage of leasing is that, since leases are required at times when other I/O operations occur,
+lease requests can almost always be piggybacked on other RPCs, avoiding some of the
+overhead associated with the explicit open and close RPCs required by a Sprite-like system.
+Compared with Sprite cache consistency,
+this can result in a significantly lower RPC load (see table #1).
+.sh 1 "Limitations of the NQNFS Protocol"
+.pp
+There is a serious risk when leasing is used for delayed write
+caching.
+If the server is simply too busy to service a lease renewal before a write-caching
+lease terminates, the client will not be able to push the write
+data to the server before the lease has terminated, resulting in
+inconsistency.
+Note that the danger of inconsistency occurs when the server assumes that
+a write-caching lease has terminated before the client has
+had the opportunity to write the data back to the server.
+In an effort to avoid this problem, the NQNFS server does not assume that
+a write-caching lease has terminated until three conditions are met:
+.sp
+.(l
+1 - clock time > (expiry time + clock skew)
+2 - there is at least one server daemon (nfsd) waiting for an RPC request
+3 - no write RPCs received for leased file within write_slack after the corrected expiry time
+.)l
+.lp
+The first condition ensures that the lease has expired on the client.
+The clock_skew, by default three seconds, must be
+set to a value larger than the maximum time-of-day clock error that is likely to occur
+during the maximum lease duration.
+The second condition attempts to ensure that the client
+is not waiting for replies to any writes that are still queued for service by
+an nfsd. The third condition tries to guarantee that the client has
+transmitted all write requests to the server, since write_slack is set to
+several times the client's timeout retransmit interval.
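+.pp
+Expressed as code, the server-side test sketched by these three
+conditions might look as follows; the names are hypothetical, with
+nfsd_waiting standing for condition 2, last_write for the arrival
+time of the most recent write RPC on the leased file, and MAX being
+the usual larger-of-two macro:
+.sp
+.nf
+	time_t expiry = lp->lc_expiry + clock_skew;	/* condition 1 */
+
+	/* may this write-caching lease be assumed terminated? */
+	if (now > expiry &&
+	    nfsd_waiting > 0 &&				/* condition 2 */
+	    now > MAX(expiry, last_write) + write_slack) /* condition 3 */
+		/* the lease may be assumed to have terminated */
+.fi
+.sp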
+.pp
+There are also certain file system semantics that are problematic for both NFS and NQNFS,
+due to the
+lack of state information maintained by the
+server. If a file is unlinked on one client while open on another it will
+be removed from the file server, resulting in failed file accesses on the
+client that has the file open.
+If the file system on the server is out of space or the client user's disk
+quota has been exceeded, a delayed write can fail long after the write system
+call was successfully completed.
+With NFS this error will be detected by the close system call, since
+the delayed writes are pushed upon close. With NQNFS however, the delayed write
+RPC may not occur until after the close system call, possibly even after the process
+has exited.
+Therefore,
+if a process must check for write errors,
+a system call such as \fIfsync\fR must be used.
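+.pp
+For example, a process that must know its data has reached the server
+before proceeding can, in outline, do:
+.sp
+.nf
+	if (fsync(fd) < 0)
+		/* a delayed write failed; report the error */
+.fi
+.sp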
+.pp
+Another problem occurs when a process on one client is
+running an executable file
+and a process on another client starts to write to the file. The read lease on
+the first client is terminated by the server, but the client has no recourse but
+to terminate the process, since the process is already in progress on the old
+executable.
+.pp
+The NQNFS protocol does not support file locking, since a file lock would have
+to involve hard state information that must be recovered after a crash.
+.sh 1 "Other NQNFS Protocol Features"
+.pp
+NQNFS also includes a variety of minor modifications to the NFS protocol, in an
+attempt to address various limitations.
+The protocol uses 64-bit file sizes and offsets in order to handle large files.
+TCP transport may be used as an alternative to UDP
+for cases where UDP does not perform well.
+Transport mechanisms
+such as TCP also permit the use of much larger read/write data sizes,
+which might improve performance in certain environments.
+.pp
+The NQNFS protocol replaces the Readdir RPC with a Readdir_and_Lookup
+RPC that returns the file handle and attributes for each file in the
+directory as well as name and file id number.
+This additional information may then be loaded into the lookup and file-attribute
+caches on the client.
+Thus, for cases such as "ls -l", the \fIstat\fR system calls can be performed
+locally without doing any lookup or getattr RPCs.
+Another additional RPC is the Access RPC that checks for file
+accessibility against the server. This is necessary since in some cases the
+client user ID is mapped to a different user on the server and doing the
+access check locally on the client using file attributes and client credentials is
+not correct.
+One case where this becomes necessary is when the NQNFS mount point is using
+Kerberos authentication, where the Kerberos authentication ticket is translated
+to credentials on the server that are mapped to the client side user id.
+For further details on the protocol, see [Macklem93].
+.sh 1 "Performance"
+.pp
+In order to evaluate the effectiveness of the NQNFS protocol,
+a benchmark was used that was
+designed to typify
+real work on the client workstation.
+Benchmarks, such as Laddis [Wittle93], that perform server load characterization
+are not appropriate for this work, since it is primarily client caching
+efficiency that needs to be evaluated.
+Since these tests are measuring overall client system performance and
+not just the performance of the file system,
+each sequence of runs was performed on identical hardware and operating system in order to factor out the system
+components affecting performance other than the file system protocol.
+.pp
+The equipment used for all the benchmarks consists of members of the DECstation\(tm\(dg
+family of workstations using the MIPS\(tm\(sc RISC architecture.
+The operating system running on these systems was a pre-release version of
+4.4BSD Unix\(tm\(dd.
+For all benchmarks, the file server was a DECstation 2100 (10 MIPS) with 8Mbytes of
+memory and a local RZ23 SCSI disk (27msec average access time).
+The clients range in speed from DECstation 2100s
+to a DECstation 5000/25, and always run with six block I/O daemons
+and a 4Mbyte buffer cache, except for the test runs where the
+buffer cache size was the independent variable.
+In all cases /tmp was mounted on the local SCSI disk\**, all machines were
+attached to the same uncongested Ethernet, and all ran in single user mode during the benchmarks.
+.(f
+\**Testing using the 4.4BSD MFS [McKusick90] resulted in slightly degraded performance,
+probably since the machines only had 16Mbytes of memory, and so paging
+increased.
+.)f
+Unless noted otherwise, test runs used UDP RPC transport
+and the results given are the average values of four runs.
+.pp
+The benchmark used is the Modified Andrew Benchmark (MAB)
+[Ousterhout90],
+which is a slightly modified version of the benchmark used to characterize
+performance of the Andrew ITC file system [Howard88].
+The MAB was set up with the executable binaries in the remote mounted file
+system and the final load step was commented out, due to a linkage problem
+during testing under 4.4BSD.
+Therefore, these results are not directly comparable to other reported MAB
+results.
+The MAB is made up of five distinct phases:
+.sp
+.ip "1." 10
+Make five directories (no significant cost)
+.ip "2." 10
+Copy a file system subtree to a working directory
+.ip "3." 10
+Get file attributes (stat) of all the working files
+.ip "4." 10
+Search for strings (grep) in the files
+.ip "5." 10
+Compile a library of C sources and archive them
+.lp
+Of the five phases, the fifth is by far the largest and is the one affected most
+by client caching mechanisms.
+The results for phase #1 are invariant over all
+the caching mechanisms.
+.sh 2 "Buffer Cache Size Tests"
+.pp
+The first experiment was done to see what effect changing the size of the
+buffer cache would have on client performance. A single DECstation 5000/25
+was used to do a series of runs of MAB with different buffer cache sizes
+for four variations of the file system protocol. The four variations are
+as follows:
+.ip "Case 1:" 10
+NFS - The NFS protocol as implemented in 4.4BSD
+.ip "Case 2:" 10
+Leases - The NQNFS protocol using leases for cache consistency
+.ip "Case 3:" 10
+Leases, Rdirlookup - The NQNFS protocol using leases for cache consistency
+and with the readdir RPC replaced by Readdir_and_Lookup
+.ip "Case 4:" 10
+Leases, Attrib leases, Rdirlookup - The NQNFS protocol using leases for
+cache consistency, with the readdir
+RPC replaced by the Readdir_and_Lookup,
+and requiring a valid lease not only for file-data access, but also for file-attribute access.
+.lp
+As can be seen in figure 1, the buffer cache achieves close to optimal
+performance over the range of two to ten megabytes in size. At eleven
+megabytes in size, the system pages heavily and the runs did not
+complete in a reasonable time. Even at 64Kbytes, the buffer cache improves
+performance over no buffer cache by a significant margin of 136-148 seconds
+versus 239 seconds.
+This may be due, in part, to the fact that the Compile Phase of the MAB
+uses a rather small working set of file data.
+All variants of NQNFS achieve about
+the same performance, running around 30% faster than NFS, with a slightly
+larger difference for large buffer cache sizes.
+Based on these results, all remaining tests were run with the buffer cache
+size set to 4Mbytes.
+Although I do not know what causes the local peak in the curves between 0.5 and 2 megabytes,
+there is some indication that contention for buffer cache blocks, between the update process
+(which pushes delayed writes to the server every thirty seconds) and the I/O
+system calls, may be involved.
+.(z
+.PS
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 0.900,7.888 to 4.787,7.888
+line dashed from 0.900,7.888 to 0.900,10.262
+line from 0.900,7.888 to 0.963,7.888
+line from 4.787,7.888 to 4.725,7.888
+line from 0.900,8.188 to 0.963,8.188
+line from 4.787,8.188 to 4.725,8.188
+line from 0.900,8.488 to 0.963,8.488
+line from 4.787,8.488 to 4.725,8.488
+line from 0.900,8.775 to 0.963,8.775
+line from 4.787,8.775 to 4.725,8.775
+line from 0.900,9.075 to 0.963,9.075
+line from 4.787,9.075 to 4.725,9.075
+line from 0.900,9.375 to 0.963,9.375
+line from 4.787,9.375 to 4.725,9.375
+line from 0.900,9.675 to 0.963,9.675
+line from 4.787,9.675 to 4.725,9.675
+line from 0.900,9.963 to 0.963,9.963
+line from 4.787,9.963 to 4.725,9.963
+line from 0.900,10.262 to 0.963,10.262
+line from 4.787,10.262 to 4.725,10.262
+line from 0.900,7.888 to 0.900,7.950
+line from 0.900,10.262 to 0.900,10.200
+line from 1.613,7.888 to 1.613,7.950
+line from 1.613,10.262 to 1.613,10.200
+line from 2.312,7.888 to 2.312,7.950
+line from 2.312,10.262 to 2.312,10.200
+line from 3.025,7.888 to 3.025,7.950
+line from 3.025,10.262 to 3.025,10.200
+line from 3.725,7.888 to 3.725,7.950
+line from 3.725,10.262 to 3.725,10.200
+line from 4.438,7.888 to 4.438,7.950
+line from 4.438,10.262 to 4.438,10.200
+line from 0.900,7.888 to 4.787,7.888
+line from 4.787,7.888 to 4.787,10.262
+line from 4.787,10.262 to 0.900,10.262
+line from 0.900,10.262 to 0.900,7.888
+line from 3.800,8.775 to 4.025,8.775
+line from 0.925,10.088 to 0.925,10.088
+line from 0.925,10.088 to 0.938,9.812
+line from 0.938,9.812 to 0.988,9.825
+line from 0.988,9.825 to 1.075,9.838
+line from 1.075,9.838 to 1.163,9.938
+line from 1.163,9.938 to 1.250,9.838
+line from 1.250,9.838 to 1.613,9.825
+line from 1.613,9.825 to 2.312,9.750
+line from 2.312,9.750 to 3.025,9.713
+line from 3.025,9.713 to 3.725,9.850
+line from 3.725,9.850 to 4.438,9.875
+dashwid = 0.037i
+line dotted from 3.800,8.625 to 4.025,8.625
+line dotted from 0.925,9.912 to 0.925,9.912
+line dotted from 0.925,9.912 to 0.938,9.887
+line dotted from 0.938,9.887 to 0.988,9.713
+line dotted from 0.988,9.713 to 1.075,9.562
+line dotted from 1.075,9.562 to 1.163,9.562
+line dotted from 1.163,9.562 to 1.250,9.562
+line dotted from 1.250,9.562 to 1.613,9.675
+line dotted from 1.613,9.675 to 2.312,9.363
+line dotted from 2.312,9.363 to 3.025,9.375
+line dotted from 3.025,9.375 to 3.725,9.387
+line dotted from 3.725,9.387 to 4.438,9.450
+line dashed from 3.800,8.475 to 4.025,8.475
+line dashed from 0.925,10.000 to 0.925,10.000
+line dashed from 0.925,10.000 to 0.938,9.787
+line dashed from 0.938,9.787 to 0.988,9.650
+line dashed from 0.988,9.650 to 1.075,9.537
+line dashed from 1.075,9.537 to 1.163,9.613
+line dashed from 1.163,9.613 to 1.250,9.800
+line dashed from 1.250,9.800 to 1.613,9.488
+line dashed from 1.613,9.488 to 2.312,9.375
+line dashed from 2.312,9.375 to 3.025,9.363
+line dashed from 3.025,9.363 to 3.725,9.325
+line dashed from 3.725,9.325 to 4.438,9.438
+dashwid = 0.075i
+line dotted from 3.800,8.325 to 4.025,8.325
+line dotted from 0.925,9.963 to 0.925,9.963
+line dotted from 0.925,9.963 to 0.938,9.750
+line dotted from 0.938,9.750 to 0.988,9.662
+line dotted from 0.988,9.662 to 1.075,9.613
+line dotted from 1.075,9.613 to 1.163,9.613
+line dotted from 1.163,9.613 to 1.250,9.700
+line dotted from 1.250,9.700 to 1.613,9.438
+line dotted from 1.613,9.438 to 2.312,9.463
+line dotted from 2.312,9.463 to 3.025,9.312
+line dotted from 3.025,9.312 to 3.725,9.387
+line dotted from 3.725,9.387 to 4.438,9.425
+.ps
+.ps -1
+.ft
+.ft I
+"0" at 0.825,7.810 rjust
+"20" at 0.825,8.110 rjust
+"40" at 0.825,8.410 rjust
+"60" at 0.825,8.697 rjust
+"80" at 0.825,8.997 rjust
+"100" at 0.825,9.297 rjust
+"120" at 0.825,9.597 rjust
+"140" at 0.825,9.885 rjust
+"160" at 0.825,10.185 rjust
+"0" at 0.900,7.660
+"2" at 1.613,7.660
+"4" at 2.312,7.660
+"6" at 3.025,7.660
+"8" at 3.725,7.660
+"10" at 4.438,7.660
+"Time (sec)" at 0.150,8.997
+"Buffer Cache Size (MBytes)" at 2.837,7.510
+"Figure #1: MAB Phase 5 (compile)" at 2.837,10.335
+"NFS" at 3.725,8.697 rjust
+"Leases" at 3.725,8.547 rjust
+"Leases, Rdirlookup" at 3.725,8.397 rjust
+"Leases, Attrib leases, Rdirlookup" at 3.725,8.247 rjust
+.ps
+.ft
+.PE
+.)z
+.sh 2 "Multiple Client Load Tests"
+.pp
+During preliminary runs of the MAB, it was observed that the server RPC
+counts were reduced significantly by NQNFS as compared to NFS (table 1).
+(Spritely NFS and Ultrix\(tm4.3/NFS numbers were taken from [Mogul93]
+and are not directly comparable, due to numerous differences in the
+experimental setup including deletion of the load step from phase 5.)
+This suggests
+that the NQNFS protocol might scale better with
+respect to the number of clients accessing the server.
+The experiment described in this section
+ran the MAB on from one to ten clients concurrently, to observe the
+effects of heavier server load.
+The clients were started at roughly the same time by pressing all the
+<return> keys together and, although not synchronized beyond that point,
+all clients would finish the test run within about two seconds of each
+other.
+This was not a realistic load of N active clients, but it did
+result in a reproducible increasing client load on the server.
+The results for the four variants
+are plotted in figures 2-5.
+.(z
+.ps -1
+.R
+.TS
+box, center;
+c s s s s s s s
+c c c c c c c c
+l | n n n n n n n.
+Table #1: MAB RPC Counts
+RPC Getattr Read Write Lookup Other GetLease/Open-Close Total
+_
+BSD/NQNFS 277 139 306 575 294 127 1718
+BSD/NFS 1210 506 451 489 238 0 2894
+Spritely NFS 259 836 192 535 306 1467 3595
+Ultrix4.3/NFS 1225 1186 476 810 305 0 4002
+.TE
+.ps
+.)z
+.pp
+For the MAB benchmark, the NQNFS protocol reduces the RPC counts significantly,
+but with a minimum of extra overhead (the GetLease/Open-Close count).
+.(z
+.PS
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 0.900,7.888 to 4.787,7.888
+line dashed from 0.900,7.888 to 0.900,10.262
+line from 0.900,7.888 to 0.963,7.888
+line from 4.787,7.888 to 4.725,7.888
+line from 0.900,8.225 to 0.963,8.225
+line from 4.787,8.225 to 4.725,8.225
+line from 0.900,8.562 to 0.963,8.562
+line from 4.787,8.562 to 4.725,8.562
+line from 0.900,8.900 to 0.963,8.900
+line from 4.787,8.900 to 4.725,8.900
+line from 0.900,9.250 to 0.963,9.250
+line from 4.787,9.250 to 4.725,9.250
+line from 0.900,9.588 to 0.963,9.588
+line from 4.787,9.588 to 4.725,9.588
+line from 0.900,9.925 to 0.963,9.925
+line from 4.787,9.925 to 4.725,9.925
+line from 0.900,10.262 to 0.963,10.262
+line from 4.787,10.262 to 4.725,10.262
+line from 0.900,7.888 to 0.900,7.950
+line from 0.900,10.262 to 0.900,10.200
+line from 1.613,7.888 to 1.613,7.950
+line from 1.613,10.262 to 1.613,10.200
+line from 2.312,7.888 to 2.312,7.950
+line from 2.312,10.262 to 2.312,10.200
+line from 3.025,7.888 to 3.025,7.950
+line from 3.025,10.262 to 3.025,10.200
+line from 3.725,7.888 to 3.725,7.950
+line from 3.725,10.262 to 3.725,10.200
+line from 4.438,7.888 to 4.438,7.950
+line from 4.438,10.262 to 4.438,10.200
+line from 0.900,7.888 to 4.787,7.888
+line from 4.787,7.888 to 4.787,10.262
+line from 4.787,10.262 to 0.900,10.262
+line from 0.900,10.262 to 0.900,7.888
+line from 3.800,8.900 to 4.025,8.900
+line from 1.250,8.325 to 1.250,8.325
+line from 1.250,8.325 to 1.613,8.500
+line from 1.613,8.500 to 2.312,8.825
+line from 2.312,8.825 to 3.025,9.175
+line from 3.025,9.175 to 3.725,9.613
+line from 3.725,9.613 to 4.438,10.012
+dashwid = 0.037i
+line dotted from 3.800,8.750 to 4.025,8.750
+line dotted from 1.250,8.275 to 1.250,8.275
+line dotted from 1.250,8.275 to 1.613,8.412
+line dotted from 1.613,8.412 to 2.312,8.562
+line dotted from 2.312,8.562 to 3.025,9.088
+line dotted from 3.025,9.088 to 3.725,9.375
+line dotted from 3.725,9.375 to 4.438,10.000
+line dashed from 3.800,8.600 to 4.025,8.600
+line dashed from 1.250,8.250 to 1.250,8.250
+line dashed from 1.250,8.250 to 1.613,8.438
+line dashed from 1.613,8.438 to 2.312,8.637
+line dashed from 2.312,8.637 to 3.025,9.088
+line dashed from 3.025,9.088 to 3.725,9.525
+line dashed from 3.725,9.525 to 4.438,10.075
+dashwid = 0.075i
+line dotted from 3.800,8.450 to 4.025,8.450
+line dotted from 1.250,8.262 to 1.250,8.262
+line dotted from 1.250,8.262 to 1.613,8.425
+line dotted from 1.613,8.425 to 2.312,8.613
+line dotted from 2.312,8.613 to 3.025,9.137
+line dotted from 3.025,9.137 to 3.725,9.512
+line dotted from 3.725,9.512 to 4.438,9.988
+.ps
+.ps -1
+.ft
+.ft I
+"0" at 0.825,7.810 rjust
+"20" at 0.825,8.147 rjust
+"40" at 0.825,8.485 rjust
+"60" at 0.825,8.822 rjust
+"80" at 0.825,9.172 rjust
+"100" at 0.825,9.510 rjust
+"120" at 0.825,9.847 rjust
+"140" at 0.825,10.185 rjust
+"0" at 0.900,7.660
+"2" at 1.613,7.660
+"4" at 2.312,7.660
+"6" at 3.025,7.660
+"8" at 3.725,7.660
+"10" at 4.438,7.660
+"Time (sec)" at 0.150,8.997
+"Number of Clients" at 2.837,7.510
+"Figure #2: MAB Phase 2 (copying)" at 2.837,10.335
+"NFS" at 3.725,8.822 rjust
+"Leases" at 3.725,8.672 rjust
+"Leases, Rdirlookup" at 3.725,8.522 rjust
+"Leases, Attrib leases, Rdirlookup" at 3.725,8.372 rjust
+.ps
+.ft
+.PE
+.)z
+.(z
+.PS
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 0.900,7.888 to 4.787,7.888
+line dashed from 0.900,7.888 to 0.900,10.262
+line from 0.900,7.888 to 0.963,7.888
+line from 4.787,7.888 to 4.725,7.888
+line from 0.900,8.188 to 0.963,8.188
+line from 4.787,8.188 to 4.725,8.188
+line from 0.900,8.488 to 0.963,8.488
+line from 4.787,8.488 to 4.725,8.488
+line from 0.900,8.775 to 0.963,8.775
+line from 4.787,8.775 to 4.725,8.775
+line from 0.900,9.075 to 0.963,9.075
+line from 4.787,9.075 to 4.725,9.075
+line from 0.900,9.375 to 0.963,9.375
+line from 4.787,9.375 to 4.725,9.375
+line from 0.900,9.675 to 0.963,9.675
+line from 4.787,9.675 to 4.725,9.675
+line from 0.900,9.963 to 0.963,9.963
+line from 4.787,9.963 to 4.725,9.963
+line from 0.900,10.262 to 0.963,10.262
+line from 4.787,10.262 to 4.725,10.262
+line from 0.900,7.888 to 0.900,7.950
+line from 0.900,10.262 to 0.900,10.200
+line from 1.613,7.888 to 1.613,7.950
+line from 1.613,10.262 to 1.613,10.200
+line from 2.312,7.888 to 2.312,7.950
+line from 2.312,10.262 to 2.312,10.200
+line from 3.025,7.888 to 3.025,7.950
+line from 3.025,10.262 to 3.025,10.200
+line from 3.725,7.888 to 3.725,7.950
+line from 3.725,10.262 to 3.725,10.200
+line from 4.438,7.888 to 4.438,7.950
+line from 4.438,10.262 to 4.438,10.200
+line from 0.900,7.888 to 4.787,7.888
+line from 4.787,7.888 to 4.787,10.262
+line from 4.787,10.262 to 0.900,10.262
+line from 0.900,10.262 to 0.900,7.888
+line from 3.800,8.775 to 4.025,8.775
+line from 1.250,8.975 to 1.250,8.975
+line from 1.250,8.975 to 1.613,8.963
+line from 1.613,8.963 to 2.312,8.988
+line from 2.312,8.988 to 3.025,9.037
+line from 3.025,9.037 to 3.725,9.062
+line from 3.725,9.062 to 4.438,9.100
+dashwid = 0.037i
+line dotted from 3.800,8.625 to 4.025,8.625
+line dotted from 1.250,9.312 to 1.250,9.312
+line dotted from 1.250,9.312 to 1.613,9.287
+line dotted from 1.613,9.287 to 2.312,9.675
+line dotted from 2.312,9.675 to 3.025,9.262
+line dotted from 3.025,9.262 to 3.725,9.738
+line dotted from 3.725,9.738 to 4.438,9.512
+line dashed from 3.800,8.475 to 4.025,8.475
+line dashed from 1.250,9.400 to 1.250,9.400
+line dashed from 1.250,9.400 to 1.613,9.287
+line dashed from 1.613,9.287 to 2.312,9.575
+line dashed from 2.312,9.575 to 3.025,9.300
+line dashed from 3.025,9.300 to 3.725,9.613
+line dashed from 3.725,9.613 to 4.438,9.512
+dashwid = 0.075i
+line dotted from 3.800,8.325 to 4.025,8.325
+line dotted from 1.250,9.400 to 1.250,9.400
+line dotted from 1.250,9.400 to 1.613,9.412
+line dotted from 1.613,9.412 to 2.312,9.700
+line dotted from 2.312,9.700 to 3.025,9.537
+line dotted from 3.025,9.537 to 3.725,9.938
+line dotted from 3.725,9.938 to 4.438,9.812
+.ps
+.ps -1
+.ft
+.ft I
+"0" at 0.825,7.810 rjust
+"5" at 0.825,8.110 rjust
+"10" at 0.825,8.410 rjust
+"15" at 0.825,8.697 rjust
+"20" at 0.825,8.997 rjust
+"25" at 0.825,9.297 rjust
+"30" at 0.825,9.597 rjust
+"35" at 0.825,9.885 rjust
+"40" at 0.825,10.185 rjust
+"0" at 0.900,7.660
+"2" at 1.613,7.660
+"4" at 2.312,7.660
+"6" at 3.025,7.660
+"8" at 3.725,7.660
+"10" at 4.438,7.660
+"Time (sec)" at 0.150,8.997
+"Number of Clients" at 2.837,7.510
+"Figure #3: MAB Phase 3 (stat/find)" at 2.837,10.335
+"NFS" at 3.725,8.697 rjust
+"Leases" at 3.725,8.547 rjust
+"Leases, Rdirlookup" at 3.725,8.397 rjust
+"Leases, Attrib leases, Rdirlookup" at 3.725,8.247 rjust
+.ps
+.ft
+.PE
+.)z
+.(z
+.PS
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 0.900,7.888 to 4.787,7.888
+line dashed from 0.900,7.888 to 0.900,10.262
+line from 0.900,7.888 to 0.963,7.888
+line from 4.787,7.888 to 4.725,7.888
+line from 0.900,8.188 to 0.963,8.188
+line from 4.787,8.188 to 4.725,8.188
+line from 0.900,8.488 to 0.963,8.488
+line from 4.787,8.488 to 4.725,8.488
+line from 0.900,8.775 to 0.963,8.775
+line from 4.787,8.775 to 4.725,8.775
+line from 0.900,9.075 to 0.963,9.075
+line from 4.787,9.075 to 4.725,9.075
+line from 0.900,9.375 to 0.963,9.375
+line from 4.787,9.375 to 4.725,9.375
+line from 0.900,9.675 to 0.963,9.675
+line from 4.787,9.675 to 4.725,9.675
+line from 0.900,9.963 to 0.963,9.963
+line from 4.787,9.963 to 4.725,9.963
+line from 0.900,10.262 to 0.963,10.262
+line from 4.787,10.262 to 4.725,10.262
+line from 0.900,7.888 to 0.900,7.950
+line from 0.900,10.262 to 0.900,10.200
+line from 1.613,7.888 to 1.613,7.950
+line from 1.613,10.262 to 1.613,10.200
+line from 2.312,7.888 to 2.312,7.950
+line from 2.312,10.262 to 2.312,10.200
+line from 3.025,7.888 to 3.025,7.950
+line from 3.025,10.262 to 3.025,10.200
+line from 3.725,7.888 to 3.725,7.950
+line from 3.725,10.262 to 3.725,10.200
+line from 4.438,7.888 to 4.438,7.950
+line from 4.438,10.262 to 4.438,10.200
+line from 0.900,7.888 to 4.787,7.888
+line from 4.787,7.888 to 4.787,10.262
+line from 4.787,10.262 to 0.900,10.262
+line from 0.900,10.262 to 0.900,7.888
+line from 3.800,8.775 to 4.025,8.775
+line from 1.250,9.412 to 1.250,9.412
+line from 1.250,9.412 to 1.613,9.425
+line from 1.613,9.425 to 2.312,9.463
+line from 2.312,9.463 to 3.025,9.600
+line from 3.025,9.600 to 3.725,9.875
+line from 3.725,9.875 to 4.438,10.075
+dashwid = 0.037i
+line dotted from 3.800,8.625 to 4.025,8.625
+line dotted from 1.250,9.450 to 1.250,9.450
+line dotted from 1.250,9.450 to 1.613,9.438
+line dotted from 1.613,9.438 to 2.312,9.438
+line dotted from 2.312,9.438 to 3.025,9.525
+line dotted from 3.025,9.525 to 3.725,9.550
+line dotted from 3.725,9.550 to 4.438,9.662
+line dashed from 3.800,8.475 to 4.025,8.475
+line dashed from 1.250,9.438 to 1.250,9.438
+line dashed from 1.250,9.438 to 1.613,9.412
+line dashed from 1.613,9.412 to 2.312,9.450
+line dashed from 2.312,9.450 to 3.025,9.500
+line dashed from 3.025,9.500 to 3.725,9.613
+line dashed from 3.725,9.613 to 4.438,9.675
+dashwid = 0.075i
+line dotted from 3.800,8.325 to 4.025,8.325
+line dotted from 1.250,9.387 to 1.250,9.387
+line dotted from 1.250,9.387 to 1.613,9.600
+line dotted from 1.613,9.600 to 2.312,9.625
+line dotted from 2.312,9.625 to 3.025,9.738
+line dotted from 3.025,9.738 to 3.725,9.850
+line dotted from 3.725,9.850 to 4.438,9.800
+.ps
+.ps -1
+.ft
+.ft I
+"0" at 0.825,7.810 rjust
+"5" at 0.825,8.110 rjust
+"10" at 0.825,8.410 rjust
+"15" at 0.825,8.697 rjust
+"20" at 0.825,8.997 rjust
+"25" at 0.825,9.297 rjust
+"30" at 0.825,9.597 rjust
+"35" at 0.825,9.885 rjust
+"40" at 0.825,10.185 rjust
+"0" at 0.900,7.660
+"2" at 1.613,7.660
+"4" at 2.312,7.660
+"6" at 3.025,7.660
+"8" at 3.725,7.660
+"10" at 4.438,7.660
+"Time (sec)" at 0.150,8.997
+"Number of Clients" at 2.837,7.510
+"Figure #4: MAB Phase 4 (grep/wc/find)" at 2.837,10.335
+"NFS" at 3.725,8.697 rjust
+"Leases" at 3.725,8.547 rjust
+"Leases, Rdirlookup" at 3.725,8.397 rjust
+"Leases, Attrib leases, Rdirlookup" at 3.725,8.247 rjust
+.ps
+.ft
+.PE
+.)z
+.(z
+.PS
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 0.900,7.888 to 4.787,7.888
+line dashed from 0.900,7.888 to 0.900,10.262
+line from 0.900,7.888 to 0.963,7.888
+line from 4.787,7.888 to 4.725,7.888
+line from 0.900,8.150 to 0.963,8.150
+line from 4.787,8.150 to 4.725,8.150
+line from 0.900,8.412 to 0.963,8.412
+line from 4.787,8.412 to 4.725,8.412
+line from 0.900,8.675 to 0.963,8.675
+line from 4.787,8.675 to 4.725,8.675
+line from 0.900,8.938 to 0.963,8.938
+line from 4.787,8.938 to 4.725,8.938
+line from 0.900,9.213 to 0.963,9.213
+line from 4.787,9.213 to 4.725,9.213
+line from 0.900,9.475 to 0.963,9.475
+line from 4.787,9.475 to 4.725,9.475
+line from 0.900,9.738 to 0.963,9.738
+line from 4.787,9.738 to 4.725,9.738
+line from 0.900,10.000 to 0.963,10.000
+line from 4.787,10.000 to 4.725,10.000
+line from 0.900,10.262 to 0.963,10.262
+line from 4.787,10.262 to 4.725,10.262
+line from 0.900,7.888 to 0.900,7.950
+line from 0.900,10.262 to 0.900,10.200
+line from 1.613,7.888 to 1.613,7.950
+line from 1.613,10.262 to 1.613,10.200
+line from 2.312,7.888 to 2.312,7.950
+line from 2.312,10.262 to 2.312,10.200
+line from 3.025,7.888 to 3.025,7.950
+line from 3.025,10.262 to 3.025,10.200
+line from 3.725,7.888 to 3.725,7.950
+line from 3.725,10.262 to 3.725,10.200
+line from 4.438,7.888 to 4.438,7.950
+line from 4.438,10.262 to 4.438,10.200
+line from 0.900,7.888 to 4.787,7.888
+line from 4.787,7.888 to 4.787,10.262
+line from 4.787,10.262 to 0.900,10.262
+line from 0.900,10.262 to 0.900,7.888
+line from 3.800,8.675 to 4.025,8.675
+line from 1.250,8.800 to 1.250,8.800
+line from 1.250,8.800 to 1.613,8.912
+line from 1.613,8.912 to 2.312,9.113
+line from 2.312,9.113 to 3.025,9.438
+line from 3.025,9.438 to 3.725,9.750
+line from 3.725,9.750 to 4.438,10.088
+dashwid = 0.037i
+line dotted from 3.800,8.525 to 4.025,8.525
+line dotted from 1.250,8.637 to 1.250,8.637
+line dotted from 1.250,8.637 to 1.613,8.700
+line dotted from 1.613,8.700 to 2.312,8.713
+line dotted from 2.312,8.713 to 3.025,8.775
+line dotted from 3.025,8.775 to 3.725,8.887
+line dotted from 3.725,8.887 to 4.438,9.037
+line dashed from 3.800,8.375 to 4.025,8.375
+line dashed from 1.250,8.675 to 1.250,8.675
+line dashed from 1.250,8.675 to 1.613,8.688
+line dashed from 1.613,8.688 to 2.312,8.713
+line dashed from 2.312,8.713 to 3.025,8.825
+line dashed from 3.025,8.825 to 3.725,8.887
+line dashed from 3.725,8.887 to 4.438,9.062
+dashwid = 0.075i
+line dotted from 3.800,8.225 to 4.025,8.225
+line dotted from 1.250,8.700 to 1.250,8.700
+line dotted from 1.250,8.700 to 1.613,8.688
+line dotted from 1.613,8.688 to 2.312,8.762
+line dotted from 2.312,8.762 to 3.025,8.812
+line dotted from 3.025,8.812 to 3.725,8.925
+line dotted from 3.725,8.925 to 4.438,9.025
+.ps
+.ps -1
+.ft
+.ft I
+"0" at 0.825,7.810 rjust
+"50" at 0.825,8.072 rjust
+"100" at 0.825,8.335 rjust
+"150" at 0.825,8.597 rjust
+"200" at 0.825,8.860 rjust
+"250" at 0.825,9.135 rjust
+"300" at 0.825,9.397 rjust
+"350" at 0.825,9.660 rjust
+"400" at 0.825,9.922 rjust
+"450" at 0.825,10.185 rjust
+"0" at 0.900,7.660
+"2" at 1.613,7.660
+"4" at 2.312,7.660
+"6" at 3.025,7.660
+"8" at 3.725,7.660
+"10" at 4.438,7.660
+"Time (sec)" at 0.150,8.997
+"Number of Clients" at 2.837,7.510
+"Figure #5: MAB Phase 5 (compile)" at 2.837,10.335
+"NFS" at 3.725,8.597 rjust
+"Leases" at 3.725,8.447 rjust
+"Leases, Rdirlookup" at 3.725,8.297 rjust
+"Leases, Attrib leases, Rdirlookup" at 3.725,8.147 rjust
+.ps
+.ft
+.PE
+.)z
+.pp
+In figure 2, where a subtree of seventy small files is copied, the difference between the protocol variants is minimal,
+with the NQNFS variants performing slightly better.
+For this case, the Readdir_and_Lookup RPC is a slight hindrance under heavy
+load, possibly because it results in larger directory blocks in the buffer
+cache.
+.pp
+In figure 3, for the phase that gets file attributes for a large number
+of files, the leasing variants take about 50% longer, indicating that
+there are performance problems in this area. For the case where valid
+current leases are required for every file when attributes are returned,
+the performance is significantly worse than when the attributes are allowed
+to be stale by a few seconds on the client.
+I have not been able to explain the oscillation in the curves for the
+Lease cases.
+.pp
+For the string searching phase depicted in figure 4, the leasing variants
+that do not require valid leases for files when attributes are returned
+appear to scale better with server load than NFS.
+However, the effect appears to be
+negligible until the server load is fairly heavy.
+.pp
+Most of the time in the MAB benchmark is spent in the compilation phase,
+and this is where the differences between caching methods are most
+pronounced.
+In figure 5 it can be seen that any protocol variant using Leases performs
+about a factor of two better than NFS
+at a load of ten clients. This indicates that the use of NQNFS may
+allow servers to handle significantly more clients for this type of
+workload.
+.pp
+Table 2 summarizes the MAB run times for all phases for the single client
+DECstation 5000/25. The \fILeases\fR case refers to using leases, the
+\fILeases, Rdirl\fR case uses the Readdir_and_Lookup RPC as well, and
+the \fIBCache Only\fR case uses leases but only the buffer cache, not
+the attribute or name caches.
+The \fINo Caching\fR case does no client side caching, performing
+all system calls via synchronous RPCs to the server.
+.(z
+.ps -1
+.R
+.TS
+box, center;
+c s s s s s s
+c c c c c c c c
+l | n n n n n n n.
+Table #2: Single DECstation 5000/25 Client Elapsed Times (sec)
+Phase 1 2 3 4 5 Total % Improvement
+_
+No Caching 6 35 41 40 258 380 -93
+NFS 5 24 15 20 133 197 0
+BCache Only 5 20 24 23 116 188 5
+Leases, Rdirl 5 20 21 20 105 171 13
+Leases 5 19 21 21 99 165 16
+.TE
+.ps
+.)z
+.sh 2 "Processor Speed Tests"
+.pp
+An important goal of client-side file system caching is to decouple the
+I/O system calls from the underlying distributed file system, so that the
+client's system performance might scale with processor speed. In order
+to test this, a series of MAB runs was performed on three
+DECstations that are similar except for processor speed.
+In addition to the four protocol variants used for the above tests, runs
+were done with the client caches turned off, to obtain worst-case
+performance numbers for a caching mechanism with a 100% miss rate. The CPU utilization
+was measured, as an indicator of how much the processor was blocking for
+I/O system calls. Note that since the systems were running in single user mode
+and otherwise quiescent, almost all CPU activity was directly related
+to the MAB run.
+The results are presented in
+table 3.
+The CPU time is simply the product of the CPU utilization and
+elapsed running time and, as such, is an optimistic bound on the performance
+achievable with an ideal client caching scheme that never blocks for I/O.
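+For example, taking the \fILeases\fR row for the DECstation 5000/25 in
+table 3, an elapsed time of 99 seconds at 89% CPU utilization gives
+0.89 \(mu 99, or about 88 seconds of CPU time, the value shown in the
+rightmost column.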
+.(z
+.ps -1
+.R
+.TS
+box, center;
+c s s s s s s s s s
+c c s s c s s c s s
+c c c c c c c c c c
+c c c c c c c c c c
+l | n n n n n n n n n.
+Table #3: MAB Phase 5 (compile)
+ DS2100 (10.5 MIPS) DS3100 (14.0 MIPS) DS5000/25 (26.7 MIPS)
+ Elapsed CPU CPU Elapsed CPU CPU Elapsed CPU CPU
+ time Util(%) time time Util(%) time time Util(%) time
+_
+Leases 143 89 127 113 87 98 99 89 88
+Leases, Rdirl 150 89 134 110 91 100 105 88 92
+BCache Only 169 85 144 129 78 101 116 75 87
+NFS 172 77 132 135 74 100 133 71 94
+No Caching 330 47 155 256 41 105 258 39 101
+.TE
+.ps
+.)z
+As can be seen in the table, any caching mechanism achieves significantly
+better performance than when caching is disabled, roughly doubling the CPU
+utilization with a corresponding reduction in run time. For NFS, the CPU
+utilization drops as CPU speed increases, which suggests that
+it is not scaling with CPU speed. For the NQNFS variants, the CPU utilization
+remains at just below 90%, which suggests that the caching mechanism is working
+well and scaling within this CPU range.
+Note that for this benchmark, the ratio of CPU times for
+the DECstation 3100 and DECstation 5000/25 is quite different from what the
+Dhrystone MIPS ratings would suggest.
+.pp
+Overall, the results seem encouraging, although it remains to be seen whether
+or not the caching provided by NQNFS can continue to scale with CPU
+performance.
+There is a good indication that NQNFS permits a server to scale
+to more clients than does NFS, at least for workloads akin to the MAB compile phase.
+A more difficult question is "What if the server is much faster doing
+write RPCs?" as a result of some technology such as Prestoserve
+or write gathering.
+Since a significant part of the difference between NFS and NQNFS is
+the synchronous writing, it is difficult to predict how far a server
+capable of fast write RPCs would erode the performance advantage of NQNFS.
+At the very least, table 1 indicates that the write RPC load on the server
+has decreased by approximately 30%, and this reduced write load should still
+result in some improvement.
+.pp
+Indications are that the Readdir_and_Lookup RPC has not improved performance
+for these tests and may in fact be degrading performance slightly.
+The results in figure 3 indicate some problems, possibly with handling
+of the attribute cache. It seems logical that the Readdir_and_Lookup RPC
+should permit priming of the attribute cache, improving its hit rate, but the
+results are counter to that.
+.sh 2 "Internetwork Delay Tests"
+.pp
+This experimental setup was used to explore how the different protocol
+variants might perform over internetworks with larger RPC RTTs. The
+server was moved to a separate Ethernet, using a MicroVAXII\(tm as an
+IP router to the other Ethernet. The 4.3Reno BSD Unix system running on the
+MicroVAXII was modified to delay forwarded IP packets by a tunable N
+milliseconds. The implementation was rather crude and did not try to
+simulate a distribution of delay times nor was it programmed to drop packets
+at a given rate, but it served as a simple emulation of a long,
+fat network\** [Jacobson88].
+.(f
+\**Long fat networks refer to network interconnections with
+a Bandwidth X RTT product > 10\u5\d bits.
+.)f
+The MAB was run using both UDP and TCP RPC transports
+for a variety of RTT delays from five to two hundred milliseconds,
+to observe the effects of RTT delay on RPC transport.
+It was found that, due to high variability between runs, four runs did not
+suffice, so eight runs were done at each value.
+The results in figure 6 and table 4 are the average for the eight runs.
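+.pp
+As a minimal sketch of the delay mechanism described above (illustrative
+only; the actual change was made in the 4.3Reno kernel forwarding path,
+and all names here are hypothetical), forwarded packets can be
+timestamped and held on a FIFO until they have aged N milliseconds:
+.(l
+/*
+ * Sketch: each packet to be forwarded is given a due time DELAY_MS in
+ * the future and queued; a periodic drain routine forwards packets
+ * whose due time has passed.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#define DELAY_MS 100                    /* tunable N millisecond delay */
+
+struct dpkt {
+        struct timeval  dp_due;         /* time at which to forward */
+        struct dpkt     *dp_next;
+        int             dp_len;
+        char            dp_data[1500];  /* assumed MTU-sized buffer */
+};
+
+static struct dpkt *dq_head, *dq_tail;
+
+/* queue a packet instead of forwarding it immediately */
+void
+delay_enqueue(const char *data, int len)
+{
+        struct dpkt *dp = malloc(sizeof *dp);
+
+        if (dp == NULL)
+                return;                 /* sketch: drop the packet */
+        gettimeofday(&dp->dp_due, NULL);
+        dp->dp_due.tv_usec += DELAY_MS * 1000;
+        if (dp->dp_due.tv_usec >= 1000000) {
+                dp->dp_due.tv_sec++;
+                dp->dp_due.tv_usec -= 1000000;
+        }
+        memcpy(dp->dp_data, data, len);
+        dp->dp_len = len;
+        dp->dp_next = NULL;
+        if (dq_tail)
+                dq_tail->dp_next = dp;
+        else
+                dq_head = dp;
+        dq_tail = dp;
+}
+
+/* called periodically: forward every packet whose delay has expired */
+void
+delay_drain(void (*forward)(const char *, int))
+{
+        struct timeval now;
+        struct dpkt *dp;
+
+        gettimeofday(&now, NULL);
+        while ((dp = dq_head) != NULL && timercmp(&dp->dp_due, &now, <)) {
+                (*forward)(dp->dp_data, dp->dp_len);
+                if ((dq_head = dp->dp_next) == NULL)
+                        dq_tail = NULL;
+                free(dp);
+        }
+}
+.)l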
+.(z
+.PS
+.ps
+.ps 10
+dashwid = 0.050i
+line dashed from 0.900,7.888 to 4.787,7.888
+line dashed from 0.900,7.888 to 0.900,10.262
+line from 0.900,7.888 to 0.963,7.888
+line from 4.787,7.888 to 4.725,7.888
+line from 0.900,8.350 to 0.963,8.350
+line from 4.787,8.350 to 4.725,8.350
+line from 0.900,8.800 to 0.963,8.800
+line from 4.787,8.800 to 4.725,8.800
+line from 0.900,9.262 to 0.963,9.262
+line from 4.787,9.262 to 4.725,9.262
+line from 0.900,9.713 to 0.963,9.713
+line from 4.787,9.713 to 4.725,9.713
+line from 0.900,10.175 to 0.963,10.175
+line from 4.787,10.175 to 4.725,10.175
+line from 0.900,7.888 to 0.900,7.950
+line from 0.900,10.262 to 0.900,10.200
+line from 1.825,7.888 to 1.825,7.950
+line from 1.825,10.262 to 1.825,10.200
+line from 2.750,7.888 to 2.750,7.950
+line from 2.750,10.262 to 2.750,10.200
+line from 3.675,7.888 to 3.675,7.950
+line from 3.675,10.262 to 3.675,10.200
+line from 4.600,7.888 to 4.600,7.950
+line from 4.600,10.262 to 4.600,10.200
+line from 0.900,7.888 to 4.787,7.888
+line from 4.787,7.888 to 4.787,10.262
+line from 4.787,10.262 to 0.900,10.262
+line from 0.900,10.262 to 0.900,7.888
+line from 4.125,8.613 to 4.350,8.613
+line from 0.988,8.400 to 0.988,8.400
+line from 0.988,8.400 to 1.637,8.575
+line from 1.637,8.575 to 2.375,8.713
+line from 2.375,8.713 to 3.125,8.900
+line from 3.125,8.900 to 3.862,9.137
+line from 3.862,9.137 to 4.600,9.425
+dashwid = 0.037i
+line dotted from 4.125,8.463 to 4.350,8.463
+line dotted from 0.988,8.375 to 0.988,8.375
+line dotted from 0.988,8.375 to 1.637,8.525
+line dotted from 1.637,8.525 to 2.375,8.850
+line dotted from 2.375,8.850 to 3.125,8.975
+line dotted from 3.125,8.975 to 3.862,9.137
+line dotted from 3.862,9.137 to 4.600,9.625
+line dashed from 4.125,8.312 to 4.350,8.312
+line dashed from 0.988,8.525 to 0.988,8.525
+line dashed from 0.988,8.525 to 1.637,8.688
+line dashed from 1.637,8.688 to 2.375,8.838
+line dashed from 2.375,8.838 to 3.125,9.150
+line dashed from 3.125,9.150 to 3.862,9.275
+line dashed from 3.862,9.275 to 4.600,9.588
+dashwid = 0.075i
+line dotted from 4.125,8.162 to 4.350,8.162
+line dotted from 0.988,8.525 to 0.988,8.525
+line dotted from 0.988,8.525 to 1.637,8.838
+line dotted from 1.637,8.838 to 2.375,8.863
+line dotted from 2.375,8.863 to 3.125,9.137
+line dotted from 3.125,9.137 to 3.862,9.387
+line dotted from 3.862,9.387 to 4.600,10.200
+.ps
+.ps -1
+.ft
+.ft I
+"0" at 0.825,7.810 rjust
+"100" at 0.825,8.272 rjust
+"200" at 0.825,8.722 rjust
+"300" at 0.825,9.185 rjust
+"400" at 0.825,9.635 rjust
+"500" at 0.825,10.097 rjust
+"0" at 0.900,7.660
+"50" at 1.825,7.660
+"100" at 2.750,7.660
+"150" at 3.675,7.660
+"200" at 4.600,7.660
+"Time (sec)" at 0.150,8.997
+"Round Trip Delay (msec)" at 2.837,7.510
+"Figure #6: MAB Phase 5 (compile)" at 2.837,10.335
+"Leases,UDP" at 4.050,8.535 rjust
+"Leases,TCP" at 4.050,8.385 rjust
+"NFS,UDP" at 4.050,8.235 rjust
+"NFS,TCP" at 4.050,8.085 rjust
+.ps
+.ft
+.PE
+.)z
+.(z
+.ps -1
+.R
+.TS
+box, center;
+c s s s s s s s s
+c c s c s c s c s
+c c c c c c c c c
+c c c c c c c c c
+l | n n n n n n n n.
+Table #4: MAB Phase 5 (compile) for Internetwork Delays
+ NFS,UDP NFS,TCP Leases,UDP Leases,TCP
+Delay Elapsed Standard Elapsed Standard Elapsed Standard Elapsed Standard
+(msec) time (sec) Deviation time (sec) Deviation time (sec) Deviation time (sec) Deviation
+_
+5 139 2.9 139 2.4 112 7.0 108 6.0
+40 175 5.1 208 44.5 150 23.8 139 4.3
+80 207 3.9 213 4.7 180 7.7 210 52.9
+120 276 29.3 273 17.1 221 7.7 238 5.8
+160 304 7.2 328 77.1 275 21.5 274 10.1
+200 372 35.0 506 235.1 338 25.2 379 69.2
+.TE
+.ps
+.)z
+.pp
+I found these results somewhat surprising, since I had assumed that stability
+across an internetwork connection would be a function of RPC transport
+protocol.
+The standard deviations observed across the eight runs suggest
+that the NQNFS protocol plays a larger role in
+maintaining stability than the underlying RPC transport protocol.
+It appears that NFS over TCP transport
+is the least stable variant tested.
+It should be noted that the TCP implementation used was roughly at 4.3BSD Tahoe
+release and that the 4.4BSD TCP implementation was far less stable and would
+fail intermittently, due to a bug I was not able to isolate.
+It would appear that some of the recent enhancements to the 4.4BSD TCP
+implementation have a detrimental effect on the performance of
+RPC-type traffic loads, which intermix small and large
+data transfers in both directions.
+More exploration of this area is clearly needed before any conclusions can
+be drawn beyond the observation that, over a local area network, TCP transport
+provides performance comparable to UDP.
+.sh 1 "Lessons Learned"
+.pp
+Evaluating the performance of a distributed file system is fraught with
+difficulties, due to the many software and hardware factors involved.
+The limited benchmarking presented here took a considerable amount of time
+and the results gained by the exercise give only indications of what the
+performance might be for a few scenarios.
+.pp
+The IP router with delay introduction proved to be a valuable tool for protocol debugging\**,
+.(f
+\**It exposed two bugs in the 4.4BSD networking code, one a problem in the Lance chip
+driver for the DECstation and the other a TCP window sizing problem that I was
+not able to isolate.
+.)f
+and may be useful for a more extensive study of performance over internetworks
+if enhanced to do a better job of simulating internetwork delay and packet loss.
+.pp
+The Leases mechanism provided a simple model for the provision of cache
+consistency and did seem to improve performance for various scenarios.
+Unfortunately, it does not provide the server state information that is required
+for file system semantics, such as locking, that many software systems demand.
+In production environments on my campus, the need for file locking and the correct
+generation of the ETXTBSY error code
+are far more important than full cache consistency, and leasing
+does not satisfy these needs.
+Another file system semantic that requires hard server state is the delay
+of file removal until the last close system call. Although Spritely NFS
+did not support this semantic either, it is logical that the open file
+state maintained by that system would make this semantic easier to
+implement than would the Leases mechanism.
+.sh 1 "Further Work"
+.pp
+The current implementation uses a fixed, moderate-sized buffer cache designed
+for the local UFS [McKusick84] file system.
+The results in figure 1 suggest that this is adequate so long as the cache
+is of an appropriate size.
+However, a mechanism permitting the cache to vary in size
+has been shown to outperform fixed sized buffer caches [Nelson90], and could
+be beneficial. It could also be useful to allow the buffer cache to grow very
+large by making use of local backing store for cases where server performance
+is limited.
+A very large buffer cache size would in turn permit experimentation with
+much larger read/write data sizes, facilitating bulk data transfers
+across long fat networks, such as will characterize the Internet of the
+near future.
+A careful redesign of the buffer cache mechanism to provide
+support for these features would probably be the next implementation step.
+.pp
+The results in figure 3 indicate that the mechanics of caching file
+attributes and maintaining the attribute cache's consistency need
+to be looked at further.
+There also needs to be more work done on the interaction between a
+Readdir_and_Lookup RPC and the name and attribute caches, in an effort
+to reduce Getattr and Lookup RPC loads.
+.pp
+The NQNFS protocol has never been used in a production environment and doing
+so would provide needed insight into how well the protocol satisfies the
+needs of real workstation environments.
+It is hoped that the distribution of the implementation in 4.4BSD will
+facilitate use of the protocol in production environments elsewhere.
+.pp
+The big question that needs to be resolved is whether Leases are an adequate
+mechanism for cache consistency or whether hard server state is required.
+Given the work presented here and in the papers related to Sprite and Spritely
+NFS, there are clear indications that a cache consistency algorithm can
+improve both performance and file system semantics.
+As yet, however, it is unclear what the best approach to maintaining consistency is.
+It would appear that hard state information is required for file locking and
+other mechanisms and, if so, it seems appropriate to use it for cache
+consistency as well.
+.sh 1 "Acknowledgements"
+.pp
+I would like to thank the members of the CSRG at the University of California,
+Berkeley for their continued support over the years. Without their encouragement and assistance this
+software would never have been implemented.
+Prof. Jim Linders and Prof. Tom Wilson here at the University of Guelph helped
+proofread this paper and Jeffrey Mogul provided a great deal of
+assistance, helping to turn my gibberish into something at least moderately
+readable.
+.sh 1 "References"
+.ip [Baker91] 15
+Mary Baker and John Ousterhout, Availability in the Sprite Distributed
+File System, In \fIOperating System Review\fR, (25)2, pg. 95-98,
+April 1991.
+.ip [Baker91a] 15
+Mary Baker, private communication, May 1991.
+.ip [Burrows88] 15
+Michael Burrows, Efficient Data Sharing, Technical Report #153,
+Computer Laboratory, University of Cambridge, Dec. 1988.
+.ip [Gray89] 15
+Cary G. Gray and David R. Cheriton, Leases: An Efficient Fault-Tolerant
+Mechanism for Distributed File Cache Consistency, In \fIProc. of the
+Twelfth ACM Symposium on Operating Systems Principles\fR, Litchfield Park,
+AZ, Dec. 1989.
+.ip [Howard88] 15
+John H. Howard, Michael L. Kazar, Sherri G. Menees, David A. Nichols,
+M. Satyanarayanan, Robert N. Sidebotham and Michael J. West,
+Scale and Performance in a Distributed File System, \fIACM Trans. on
+Computer Systems\fR, (6)1, pg. 51-81, Feb. 1988.
+.ip [Jacobson88] 15
+Van Jacobson and R. Braden, \fITCP Extensions for Long-Delay Paths\fR,
+ARPANET Working Group Requests for Comment, DDN Network Information Center,
+SRI International, Menlo Park, CA, October 1988, RFC-1072.
+.ip [Jacobson89] 15
+Van Jacobson, Sun NFS Performance Problems, \fIPrivate Communication,\fR
+November, 1989.
+.ip [Juszczak89] 15
+Chet Juszczak, Improving the Performance and Correctness of an NFS Server,
+In \fIProc. Winter 1989 USENIX Conference,\fR pg. 53-63, San Diego, CA, January 1989.
+.ip [Juszczak94] 15
+Chet Juszczak, Improving the Write Performance of an NFS Server,
+to appear in \fIProc. Winter 1994 USENIX Conference,\fR San Francisco, CA, January 1994.
+.ip [Kazar88] 15
+Michael L. Kazar, Synchronization and Caching Issues in the Andrew File System,
+In \fIProc. Winter 1988 USENIX Conference,\fR pg. 27-36, Dallas, TX, February
+1988.
+.ip [Kent87] 15
+Christopher A. Kent and Jeffrey C. Mogul, \fIFragmentation Considered Harmful\fR, Research Report 87/3,
+Digital Equipment Corporation Western Research Laboratory, Dec. 1987.
+.ip [Kent87a] 15
+Christopher A. Kent, \fICache Coherence in Distributed Systems\fR, Research Report 87/4,
+Digital Equipment Corporation Western Research Laboratory, April 1987.
+.ip [Macklem90] 15
+Rick Macklem, Lessons Learned Tuning the 4.3BSD Reno Implementation of the
+NFS Protocol,
+In \fIProc. Winter 1991 USENIX Conference,\fR pg. 53-64, Dallas, TX,
+January 1991.
+.ip [Macklem93] 15
+Rick Macklem, The 4.4BSD NFS Implementation,
+In \fIThe System Manager's Manual\fR, 4.4 Berkeley Software Distribution,
+University of California, Berkeley, June 1993.
+.ip [McKusick84] 15
+Marshall K. McKusick, William N. Joy, Samuel J. Leffler and Robert S. Fabry,
+A Fast File System for UNIX, \fIACM Transactions on Computer Systems\fR,
+Vol. 2, Number 3, pg. 181-197, August 1984.
+.ip [McKusick90] 15
+Marshall K. McKusick, Michael J. Karels and Keith Bostic, A Pageable Memory
+Based Filesystem,
+In \fIProc. Summer 1990 USENIX Conference,\fR pg. 137-143, Anaheim, CA, June
+1990.
+.ip [Mogul93] 15
+Jeffrey C. Mogul, Recovery in Spritely NFS,
+Research Report 93/2, Digital Equipment Corporation Western Research
+Laboratory, June 1993.
+.ip [Moran90] 15
+Joseph Moran, Russel Sandberg, Don Coleman, Jonathan Kepecs and Bob Lyon,
+Breaking Through the NFS Performance Barrier,
+In \fIProc. Spring 1990 EUUG Conference,\fR pg. 199-206, Munich, FRG,
+April 1990.
+.ip [Nelson88] 15
+Michael N. Nelson, Brent B. Welch, and John K. Ousterhout, Caching in the
+Sprite Network File System, \fIACM Transactions on Computer Systems\fR, (6)1,
+pg. 134-154, February 1988.
+.ip [Nelson90] 15
+Michael N. Nelson, \fIVirtual Memory vs. The File System\fR, Research Report
+90/4, Digital Equipment Corporation Western Research Laboratory, March 1990.
+.ip [Nowicki89] 15
+Bill Nowicki, Transport Issues in the Network File System, In \fIComputer
+Communication Review\fR, pg. 16-20, March 1989.
+.ip [Ousterhout90] 15
+John K. Ousterhout, Why Aren't Operating Systems Getting Faster As Fast as
+Hardware? In \fIProc. Summer 1990 USENIX Conference\fR, pg. 247-256, Anaheim,
+CA, June 1990.
+.ip [Sandberg85] 15
+Russel Sandberg, David Goldberg, Steve Kleiman, Dan Walsh, and Bob Lyon,
+Design and Implementation of the Sun Network Filesystem, In \fIProc. Summer
+1985 USENIX Conference\fR, pg. 119-130, Portland, OR, June 1985.
+.ip [Srinivasan89] 15
+V. Srinivasan and Jeffrey C. Mogul, Spritely NFS: Experiments with
+Cache-Consistency Protocols,
+In \fIProc. of the
+Twelfth ACM Symposium on Operating Systems Principles\fR, Litchfield Park,
+AZ, Dec. 1989.
+.ip [Steiner88] 15
+J. G. Steiner, B. C. Neuman and J. I. Schiller, Kerberos: An Authentication
+Service for Open Network Systems,
+In \fIProc. Winter 1988 USENIX Conference,\fR pg. 191-202, Dallas, TX, February
+1988.
+.ip [SUN89] 15
+Sun Microsystems Inc., \fINFS: Network File System Protocol Specification\fR,
+ARPANET Working Group Requests for Comment, DDN Network Information Center,
+SRI International, Menlo Park, CA, March 1989, RFC-1094.
+.ip [SUN93] 15
+Sun Microsystems Inc., \fINFS: Network File System Version 3 Protocol Specification\fR,
+Sun Microsystems Inc., Mountain View, CA, June 1993.
+.ip [Wittle93] 15
+Mark Wittle and Bruce E. Keith, LADDIS: The Next Generation in NFS File
+Server Benchmarking,
+In \fIProc. Summer 1993 USENIX Conference,\fR pg. 111-128, Cincinnati, OH, June
+1993.
+.(f
+\(mo
+NFS is believed to be a trademark of Sun Microsystems, Inc.
+.)f
+.(f
+\(dg
+Prestoserve is a trademark of Legato Systems, Inc.
+.)f
+.(f
+\(sc
+MIPS is a trademark of Silicon Graphics, Inc.
+.)f
+.(f
+\(dg
+DECstation, MicroVAXII and Ultrix are trademarks of Digital Equipment Corp.
+.)f
+.(f
+\(dd
+Unix is a trademark of Novell, Inc.
+.)f
diff --git a/share/doc/papers/px/Makefile b/share/doc/papers/px/Makefile
new file mode 100644
index 000000000000..33bb3f1b2bba
--- /dev/null
+++ b/share/doc/papers/px/Makefile
@@ -0,0 +1,15 @@
+# @(#)Makefile 5.3 (Berkeley) 6/8/93
+
+DIR= papers/px
+SRCS= pxin0.n pxin1.n pxin2.n pxin3.n pxin4.n
+EXTRA= fig1.1.n fig1.2.n fig1.3.n fig2.3.raw fig2.4.n fig3.2.n \
+ fig3.3.n table2.1.n table2.2.n table2.3.n table3.1.n tmac.p
+CLEANFILES+=fig2.3.n
+
+paper.ps: ${SRCS} fig2.3.n
+ ${SOELIM} ${SRCS} | ${TBL} | ${ROFF} > ${.TARGET}
+
+fig2.3.n: fig2.3.raw
+ sort fig2.3.raw >fig2.3.n
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/px/fig1.1.n b/share/doc/papers/px/fig1.1.n
new file mode 100644
index 000000000000..290777ed8ae1
--- /dev/null
+++ b/share/doc/papers/px/fig1.1.n
@@ -0,0 +1,71 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig1.1.n 5.2 (Berkeley) 4/17/91
+.\"
+.KF
+.TS
+center;
+c l
+l l
+_ l
+| l |
+| cw(18) | aw(28)
+| _ | l
+| c | a.
+Base of stack frame
+
+
+
+Block mark Positive offsets
+.sp
+ \(<- Display entry points here
+.sp
+Local
+variables
+.sp
+_ Negative offsets
+Temporary
+expression
+space
+.sp
+.T&
+| _ | l
+c l.
+
+.sp
+Top of stack frame
+.TE
+.sp
+.ce
+Figure 1.1 \- Structure of stack frame
+.sp
+.KE
diff --git a/share/doc/papers/px/fig1.2.n b/share/doc/papers/px/fig1.2.n
new file mode 100644
index 000000000000..4f835b7564f5
--- /dev/null
+++ b/share/doc/papers/px/fig1.2.n
@@ -0,0 +1,68 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig1.2.n 5.2 (Berkeley) 4/17/91
+.\"
+.KF
+.TS
+center;
+l l
+| cw(22n) | aw(20n).
+_ \&
+ Created by \s-2CALL\s0
+Saved lino
+.sp
+Saved lc
+.sp
+Saved dp
+.sp
+_ \&
+ Created by \s-2BEG\s0
+Saved dp contents
+.sp
+Pointer to current
+entry line and
+section name
+.sp
+Current file name
+and buffer
+.sp
+Top of stack reference
+.sp
+.T&
+| _ | l.
+
+.TE
+.sp
+.ce
+Figure 1.2 \- Block mark structure
+.sp
+.KE
diff --git a/share/doc/papers/px/fig1.3.n b/share/doc/papers/px/fig1.3.n
new file mode 100644
index 000000000000..934296f41fe1
--- /dev/null
+++ b/share/doc/papers/px/fig1.3.n
@@ -0,0 +1,60 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig1.3.n 5.2 (Berkeley) 4/17/91
+.\"
+.TS
+center, allbox;
+lw(20).
+T{
+.nf
+.ce 1000
+Space for
+value returned
+from f
+.ce 0
+.fi
+T}
+T{
+.ce
+Value of a
+T}
+T{
+.sp
+.ce
+Block Mark
+.sp
+T}
+.TE
+.sp .1i
+.ce
+Figure 1.3 \- Stack structure on function call `f(a)'
+.sp .1i
diff --git a/share/doc/papers/px/fig2.3.raw b/share/doc/papers/px/fig2.3.raw
new file mode 100644
index 000000000000..07feddf83317
--- /dev/null
+++ b/share/doc/papers/px/fig2.3.raw
@@ -0,0 +1,103 @@
+HALT 2.2 Produce control flow backtrace
+BEG s,W,w," 2.2,1.8 Write second part of block mark, enter block
+END 2.2,1.8 End block execution
+CALL l,A 2.2,1.8 Procedure or function call
+NODUMP s,W,w," 2.2 \s-2BEG\s0 main program, suppress dump
+PUSH s 2.2,1.9 Clear space (for function result)
+POP s 2.2,1.9 Pop (arguments) off stack
+LINO s 2.2 Set line number, count statements
+TRA a 2.2 Short control transfer (local branching)
+TRA4 A 2.2 Long control transfer
+GOTO l,A 2.2,1.8 Non-local goto statement
+IF a 2.3 Conditional transfer
+REL* r 2.3 Relational test yielding Boolean result
+AND 2.4 Boolean and
+OR 2.4 Boolean or
+NOT 2.4 Boolean not
+LRV* l,A 2.5 Right value (load) operators
+RV* l,a 2.5 Right value (load) operators
+CON* v 2.5 Load constant operators
+AS* 2.5 Assignment operators
+OFF s 2.5 Offset address, typically used for field reference
+INX* s,w,w 2.6 Subscripting (indexing) operator
+NIL 2.6 Assert non-nil pointer
+LLV l,W 2.6 Address of operator
+LV l,w 2.6 Address of operator
+IND* 2.6 Indirection operators
+ADD* 2.7 Addition
+SUB* 2.7 Subtraction
+MUL* 2.7 Multiplication
+SQR* 2.7 Squaring
+DIV* 2.7 Fixed division
+MOD* 2.7 Modulus
+ABS* 2.7 Absolute value
+NEG* 2.7 Negation
+DVD* 2.7 Floating division
+RANG* v 2.8 Subrange checking
+CASEOP* 2.9 Case statements
+FOR* a 2.12 For statements
+PXPBUF w 2.10 Initialize \fIpxp\fP count buffer
+TRACNT w,A 2.10 Count a procedure entry
+COUNT w 2.10 Count a statement count point
+CTTOT s,w,w 2.11 Construct set
+CARD s 2.11 Cardinality of set
+STOI 2.12 Convert short to long integer
+STOD 2.12 Convert short integer to real
+ITOD 2.12 Convert integer to real
+ITOS 2.12 Convert integer to short integer
+GET 3.7 Get next record from a file
+PUT 3.8 Output a record to a file
+MESSAGE 3.6 Write to terminal
+FNIL 3.7 Check file initialized, not eof, synced
+FLUSH 3.11 Flush a file
+BUFF 3.11 Specify buffering for file "output"
+EOF 3.10 Returns \fItrue\fR if end of file
+EOLN 3.10 Returns \fItrue\fR if end of line on input text file
+RESET 3.11 Open file for input
+REWRITE 3.11 Open file for output
+REMOVE 3.11 Remove a file
+UNIT* 3.10 Set active file
+READ* 3.7 Read a record from a file
+WRITEC 3.8 Character unformatted write
+WRITEF l 3.8 General formatted write
+WRITES l 3.8 String unformatted write
+WRITLN 3.8 Output a newline to a text file
+PAGE 3.8 Output a formfeed to a text file
+MIN s 3.8 Minimum of top of stack and \fIs\fR
+MAX s,w 3.8 Maximum of top of stack and \fIw\fR
+NAM A 3.8 Convert enumerated type value to print format
+FILE 3.9 Push descriptor for active file
+DEFNAME 3.11 Attach file name for \fBprogram\fR statement files
+PACK s,w,w,w 2.15 Convert and copy from unpacked to packed
+UNPACK s,w,w,w 2.15 Convert and copy from packed to unpacked
+LLIMIT 2.14 Set linelimit for output text file
+ARGC 2.14 Returns number of arguments to current process
+ARGV 2.14 Copy specified process argument into char array
+CLCK 2.14 Returns user time of program
+SCLCK 2.14 Returns system time of program
+WCLCK 2.14 Returns current time stamp
+DATE 2.14 Copy date into char array
+TIME 2.14 Copy time into char array
+SEED 2.13 Set random seed, return old seed
+RANDOM 2.13 Returns random number
+DISPOSE 2.15 Dispose of a heap allocation
+NEW s 2.15 Allocate a record on heap, set pointer to it
+EXPO 2.13 Returns machine representation of real exponent
+ATAN 2.13 Returns arctangent of argument
+EXP 2.13 Returns exponential of argument
+LN 2.13 Returns natural log of argument
+COS 2.13 Returns cos of argument
+SIN 2.13 Returns sin of argument
+SQRT 2.13 Returns square root of argument
+CHR* 2.15 Returns integer to ascii mapping of argument
+ODD* 2.15 Returns \fItrue\fR if argument is odd, \fIfalse\fR if even
+PRED* 2.7 Returns predecessor of argument
+STLIM 2.14 Set program statement limit
+SUCC* 2.7 Returns successor of argument
+ROUND 2.13 Returns \s-2TRUNC\s0(argument + 0.5)
+TRUNC 2.13 Returns integer part of argument
+UNDEF 2.15 Returns \fIfalse\fR
+SDUP 2.2 Duplicate top stack word
+ASRT 2.12 Assert \fItrue\fR to continue
+IN s,w,w 2.11 Set membership
+INCT 2.11 Membership in a constructed set
diff --git a/share/doc/papers/px/fig2.4.n b/share/doc/papers/px/fig2.4.n
new file mode 100644
index 000000000000..d752a0d2267c
--- /dev/null
+++ b/share/doc/papers/px/fig2.4.n
@@ -0,0 +1,57 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig2.4.n 5.2 (Berkeley) 4/17/91
+.\"
+.KF
+.TS
+center, box;
+cw(15).
+\s-2CASEOP\s0
+_
+No. of cases
+_
+.sp
+Case
+transfer
+table
+.sp
+_
+.sp
+Array of case
+label values
+.sp
+.TE
+.sp
+.ce
+Figure 2.4 \- Case data structure
+.sp
+.KE
diff --git a/share/doc/papers/px/fig3.2.n b/share/doc/papers/px/fig3.2.n
new file mode 100644
index 000000000000..d8905a9999b2
--- /dev/null
+++ b/share/doc/papers/px/fig3.2.n
@@ -0,0 +1,56 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig3.2.n 5.2 (Berkeley) 4/17/91
+.\"
+.KF
+.TS
+center, box;
+cw(15).
+No. of cases
+_
+.sp
+offsets
+of element
+names
+.sp
+_
+.sp
+Array of
+null terminated
+element names
+.sp
+.TE
+.sp
+.ce
+Figure 3.2 \- Enumerated type conversion structure
+.sp
+.KE
diff --git a/share/doc/papers/px/fig3.3.n b/share/doc/papers/px/fig3.3.n
new file mode 100644
index 000000000000..bf42dab0ed48
--- /dev/null
+++ b/share/doc/papers/px/fig3.3.n
@@ -0,0 +1,57 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)fig3.3.n 5.2 (Berkeley) 4/17/91
+.\"
+.KF
+.TS
+center;
+l l
+l | cw(15) |.
+ _
+\fIbool\fP: 2
+ _
+ 6
+ _
+ 12
+ _
+ 17
+ _
+ "false"
+ _
+ "true"
+ _
+.TE
+.sp
+.ce
+Figure 3.3 \- Boolean type conversion structure
+.sp
+.KE
diff --git a/share/doc/papers/px/pxin0.n b/share/doc/papers/px/pxin0.n
new file mode 100644
index 000000000000..18edfc6e3fbb
--- /dev/null
+++ b/share/doc/papers/px/pxin0.n
@@ -0,0 +1,140 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)pxin0.n 5.2 (Berkeley) 4/17/91
+.\"
+.if !\n(xx .so tmac.p
+.RP
+.TL
+Berkeley Pascal
+PX Implementation Notes
+.br
+Version 2.0 \- January, 1979
+.AU
+William N. Joy\*(Dg
+.AU
+M. Kirk McKusick\*(Dd
+.AI
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+.PP
+Berkeley Pascal
+is designed for interactive instructional use and runs on the
+.SM "VAX 11/780" .
+The interpreter
+.I px
+executes the Pascal binaries generated by the Pascal translator
+.I pi .
+.PP
+The
+.I
+PX Implementation Notes
+.R
+describe the general organization of
+.I px ,
+detail the various operations of the interpreter,
+and describe the file input/output structure.
+Conclusions are given on the viability of an interpreter
+based approach to language implementation for an instructional environment.
+.AE
+.if n 'ND
+.SH
+Introduction
+.PP
+These
+.I
+PX Implementation Notes
+.R
+have been updated from the original
+.SM "PDP 11/70"
+implementation notes to reflect the interpreter that runs on the
+.SM "VAX 11/780" .
+These notes consist of four major parts.
+The first part outlines the general organization of
+.I px .
+Section 2 describes the operations (instructions) of the interpreter
+while section 3 focuses on input/output related activity.
+A final section gives conclusions about the viability of an interpreter
+based approach to language implementation for instruction.
+.SH
+Related Berkeley Pascal documents
+.PP
+The
+.I "PXP Implementation Notes"
+give details of the internals of the execution profiler
+.I pxp;
+parts of the interpreter related to
+.I pxp
+are discussed in section 2.10.
+A paper describing the syntactic error recovery mechanism used in
+.I pi
+was presented at the ACM Conference on Compiler Construction
+in Boulder, Colorado in August, 1979.
+.SH
+Acknowledgements
+.PP
+This version of
+.I px
+is a
+.SM "PDP 11/70"
+to
+.SM "VAX 11/780"
+opcode mapping of the original
+.I px
+that was designed and implemented by Ken Thompson,
+with extensive modifications and additions
+by William Joy
+and Charles Haley.
+Without their work, this
+.UP
+system would never have existed.
+These notes were first written by William Joy for the
+.SM "PDP 11/70"
+implementation.
+We would also like to thank our faculty advisor Susan L. Graham
+for her encouragement,
+her helpful comments and suggestions
+relating to
+.UP
+and her excellent editorial assistance.
+.FS
+\*(dg\ The financial support of the National Science Foundation under grants
+MCS74-07644-A03 and MCS78-07291
+and of an \s-2IBM\s0 Graduate Fellowship are gratefully acknowledged.
+.FE
+.FS
+\*(dd\ The financial support of a Howard Hughes Graduate
+Fellowship is gratefully acknowledged.
+.FE
+.bp
diff --git a/share/doc/papers/px/pxin1.n b/share/doc/papers/px/pxin1.n
new file mode 100644
index 000000000000..9a2c256a2c5b
--- /dev/null
+++ b/share/doc/papers/px/pxin1.n
@@ -0,0 +1,538 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)pxin1.n 5.2 (Berkeley) 4/17/91
+.\"
+.if !\n(xx .so tmac.p
+.tr _\(ru
+.nr H1 0
+.NH
+Organization
+.PP
+Most of
+.I px
+is written in the
+.SM "VAX 11/780"
+assembly language, using the
+.UX
+assembler
+.I as.
+Portions of
+.I px
+are also written in the
+.UX
+systems programming language C.
+.I Px
+consists of a main procedure that reads in the interpreter code,
+a main interpreter loop that transfers successively to various
+code segments implementing the abstract machine operations,
+built-in procedures and functions,
+and several routines that support the implementation of the
+Pascal input-output environment.
+.PP
+The interpreter runs at a fraction of the speed of equivalent
+compiled C code, with this fraction varying from 1/5 to 1/15.
+The interpreter occupies 18.5K bytes of instruction space, shared among
+all processes executing Pascal, and has 4.6K bytes of data space (constants,
+error messages, etc.) a copy of which is allocated to each executing process.
+.NH 2
+Format of the object file
+.PP
+.I Px
+normally interprets the code left in an object file by a run of the
+Pascal translator
+.I pi.
+The file where the translator puts the object originally, and the most
+commonly interpreted file, is called
+.I obj.
+In order that all persons using
+.I px
+share a common text image, this executable file is
+a small bootstrap program that coordinates with the interpreter to start
+execution.
+The interpreter code is placed
+at the end of a special ``header'' file and the size of the initialized
+data area of this header file is expanded to include this code,
+so that during execution it is located at an
+easily determined address in its data space.
+When executed, the object process creates a
+.I pipe ,
+creates another process by doing a
+.I fork ,
+and arranges that the resulting parent process becomes an instance of
+.I px .
+The child process then writes the interpreter code through
+the pipe to the parent interpreter process.
+When this process is complete, the child exits.
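+.PP
+This startup can be sketched in C.
+The sketch is an illustrative reconstruction rather than the actual
+stub: the names \fIobjcode\fP and \fIobjsize\fP and the path of the
+interpreter binary are assumed.
+.DS
+.mD
+#include <unistd.h>
+
+extern char objcode[];  /* interpreter code in the data area */
+extern int objsize;     /* its size, from the header */
+
+int
+main(void)
+{
+        int pd[2];
+
+        pipe(pd);               /* the pipe that carries the code */
+        if (fork() != 0) {      /* parent becomes an instance of px */
+                close(pd[1]);
+                dup2(pd[0], 0); /* px reads the code from the pipe */
+                execl("/usr/lib/px", "px", (char *)0);
+        }
+        close(pd[0]);           /* child writes the code, then exits */
+        write(pd[1], objcode, objsize);
+        return (0);
+}
+.DE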
+.PP
+The real advantage of this approach is that it does not require modifications
+to the shell, and that the resultant objects are ``true objects'' not
+requiring special treatment.
+A simpler mechanism would be to determine the name of the file that was
+executed and pass this to the interpreter.
+However it is not possible to determine this name
+in all cases.\*(Dd
+.FS
+\*(dd\ For instance, if the
+.I pxref
+program is placed in the directory
+`/usr/bin'
+then when the user types
+``pxref program.p''
+the first argument to the program, nominally the program's name, is
+``pxref.''
+While it would be possible to search in the standard place,
+i.e. the current directory, and the system directories
+`/bin'
+and
+`/usr/bin'
+for a corresponding object file,
+this would be expensive and not guaranteed to succeed.
+Several shells exist that allow other directories to be searched
+for commands, and there is,
+in general,
+no way to determine what these directories are.
+.FE
+.NH 2
+General features of object code
+.PP
+Pascal object code is relocatable as all addressing references for
+control transfers within the code are relative.
+The code consists of instructions interspersed with inline data.
+All instructions have a length that is an even number of bytes.
+No variables are kept in the object code area.
+.PP
+The first byte of a Pascal interpreter instruction contains an operation
+code.
+This allows a total of 256 major operation codes, and 232 of these are
+in use in the current
+.I px.
+The second byte of each interpreter instruction is called the
+``sub-operation code'',
+or more commonly the
+.I sub-opcode.
+It contains a small integer that may, for example, be used as a
+block-structure level for the associated operation.
+If the instruction can take a longword constant,
+this constant is often packed into the sub-opcode
+if it fits into 8 bits and is not zero.
+A sub-opcode value of zero specifies that the constant would not
+fit and therefore follows in the next word.
+This is a space optimization, the value of zero for flagging
+the longer case being convenient because it is easy to test.
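+.PP
+In C, the decoding convention for a constant packed in the sub-opcode
+might be sketched as follows; this is illustrative only, as the
+interpreter itself does this decoding in assembly language.
+.DS
+.mD
+/* fetch a constant that may be packed in the sub-opcode */
+int
+getconst(unsigned char **lc)
+{
+        int v;
+
+        v = *(*lc)++;           /* the sub-opcode byte */
+        if (v == 0) {           /* zero: it is in the next word */
+                v = *(short *)*lc;
+                *lc += sizeof (short);
+        }
+        return (v);
+}
+.DE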
+.PP
+Other instruction formats are used.
+The branching
+instructions take an offset in the following word,
+operators that load constants onto the stack
+take arbitrarily long inline constant values,
+and many operations deal exclusively with data on the
+interpreter stack, requiring no inline data.
+.NH 2
+Stack structure of the interpreter
+.PP
+The interpreter emulates a stack-structured Pascal machine.
+The ``load'' instructions put values onto the stack, where all
+arithmetic operations take place.
+The ``store'' instructions take values off the stack
+and place them in an address that is also contained on the stack.
+The only way to move data or to compute in the machine is with the stack.
+.PP
+To make the interpreter operations more powerful
+and to thereby increase the interpreter speed,
+the arithmetic operations in the interpreter are ``typed''.
+That is, length conversion of arithmetic values occurs when they are
+used in an operation.
+This eliminates interpreter cycles for length conversion
+and the associated overhead.
+For example, when adding an integer that fits in one byte to one that
+requires four bytes to store, no ``conversion'' operators are required.
+The one byte integer is loaded onto the stack, followed by the four
+byte integer, and then an adding operator is used that has, implicit
+in its definition, the sizes of the arguments.
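+.PP
+As a sketch in C, a typed addition such as an
+.SM ADD24
+operator (following the operator suffix convention of section 2.1)
+might be written as follows, with \fIlong\fP assumed to be the 4 byte
+integer of the
+.SM VAX :
+.DS
+.mD
+extern char *sp;        /* interpreter stack pointer, grows down */
+
+void
+add24(void)
+{
+        long a;
+
+        a = *(short *)sp;       /* pop and widen the 2 byte top */
+        sp += sizeof (short);
+        /* the 4 byte operand below is replaced by the 4 byte sum */
+        *(long *)sp += a;
+}
+.DE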
+.NH 2
+Data types in the interpreter
+.PP
+The interpreter deals with several different fundamental data types.
+In the memory of the machine, 1, 2, and 4 byte integers are supported,
+with only 2 and 4 byte integers being present on the stack.
+The interpreter always converts to 4 byte integers when there is a possibility
+of overflowing the shorter formats.
+This corresponds to the Pascal language definition of overflow in
+arithmetic operations that requires that the result be correct
+if all partial values lie within the bounds of the base integer type:
+4 byte integer values.
+.PP
+Character constants are treated similarly to 1 byte integers for
+most purposes, as are Boolean values.
+All enumerated types are treated as integer values of
+an appropriate length, usually 1 byte.
+The interpreter also has real numbers, occupying 8 bytes of storage,
+and sets and strings of varying length.
+The appropriate operations are included for each data type, such as
+set union and intersection and an operation to write a string.
+.PP
+No special
+.B packed
+data formats are supported by the interpreter.
+The smallest unit of storage occupied by any variable is one byte.
+The built-ins
+.I pack
+and
+.I unpack
+thus degenerate to simple memory to memory transfers with
+no special processing.
+.NH 2
+Runtime environment
+.PP
+The interpreter runtime environment uses a stack data area and a heap
+data area, that are kept at opposite ends of memory
+and grow towards each other.
+All global variables and variables local to procedures and functions
+are kept in the stack area.
+Dynamically allocated variables and buffers for input/output are
+allocated in the heap.
+.PP
+The addressing of block structured variables is done by using
+a fixed display
+that contains the address of its stack frame
+for each statically active block.\*(Dg
+.FS
+\*(dg\ Here ``block'' is being used to mean any
+.I procedure ,
+.I function
+or the main program.
+.FE
+This display is referenced by instructions that load and store
+variables and maintained by the operations for
+block entry and exit, and for non-local
+.B goto
+statements.
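+.PP
+In C, the address calculation that the display supports is simply
+the following sketch (the array bound is arbitrary):
+.DS
+.mD
+char *display[20];      /* frame address for each static level */
+
+char *
+varaddr(int level, int offset)
+{
+        /* locals have negative offsets from the display entry */
+        return (display[level] + offset);
+}
+.DE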
+.NH 2
+Dp, lc, loop
+.PP
+Three ``global'' variables in the interpreter, in addition to the
+``display'', are the
+.I dp,
+.I lc,
+and the
+.I loop.
+The
+.I dp
+is a pointer to the display entry for the current block;
+the
+.I lc
+is the abstract machine location counter;
+and the
+.I loop
+is a register that holds the address of the main interpreter
+loop so that returning to the loop to fetch the next instruction is
+a fast operation.
+.NH 2
+The stack frame structure
+.PP
+Each active block
+has a stack frame consisting of three parts:
+a block mark, local variables, and temporary storage for partially
+evaluated expressions.
+The stack in the interpreter grows from the high addresses in memory
+to the low addresses,
+so that those parts of the stack frame that are ``on the top''
+of the stack have the most negative offsets from the display
+entry for the block.
+The major parts of the stack frame are represented in Figure 1.1.
+.so fig1.1.n
+Note that the local variables of each block
+have negative offsets from the corresponding display entry,
+the ``first'' local variable having offset `\-2'.
+.NH 2
+The block mark
+.PP
+The block mark contains the saved information necessary
+to restore the environment when the current block exits.
+It consists of two parts.
+The first and top-most part is saved by the
+.SM CALL
+instruction in the interpreter.
+This information is not present for the main program
+as it is never ``called''.
+The second part of the block mark is created by the
+.SM BEG
+begin block operator that also allocates and clears the
+local variable storage.
+The format of these blocks is represented in Figure 1.2.
+.sp
+.so fig1.2.n
+.PP
+The data saved by the
+.SM CALL
+operator includes the line number
+.I lino
+of the point of call,
+that is printed if the program execution ends abnormally;
+the location counter
+.I lc
+giving the return address;
+and the current display entry address
+.I dp
+at the time of call.
+.PP
+The
+.SM BEG
+begin operator saves the previous display contents at the level
+of this block, so that the display can be restored on block exit.
+A pointer to the beginning line number and the
+name of this block is also saved.
+This information is stored in the interpreter object code in-line after the
+.SM BEG
+operator.
+It is used in printing a post-mortem backtrace.
+The saved file name and buffer reference are necessary because of
+the input/output structure
+(this is discussed in detail in
+sections 3.3 and 3.4).
+The top of stack reference gives the value the stack pointer should
+have when there are no expression temporaries on the stack.
+It is used for a consistency check in the
+.SM LINO
+line number operators in the interpreter, that occurs before
+each statement is executed.
+This helps to catch bugs in the interpreter, that often manifest
+themselves by leaving the stack non-empty between statements.
+.PP
+Note that there is no explicit static link here.
+Thus to set up the display correctly after a non-local
+.B goto
+statement one must ``unwind''
+through all the block marks on the stack to rebuild the display.
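+.PP
+Rendered as a C structure, the block mark is roughly as follows.
+The field names are illustrative, and the fields are listed in
+declaration order rather than in their actual stack layout.
+.DS
+.mD
+struct blockmark {
+        /* saved by CALL; absent for the main program */
+        int     lino;   /* line number of the point of call */
+        char    *lc;    /* return address */
+        char    **dp;   /* caller's display entry address */
+        /* saved by BEG */
+        char    *odisp; /* previous display contents at this level */
+        char    *name;  /* begin line number and block name */
+        char    *file;  /* saved active file name */
+        char    *buf;   /* saved active file buffer */
+        char    *tos;   /* top of stack reference */
+};
+.DE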
+.NH 2
+Arguments and return values
+.PP
+A function returns its value into a space reserved by the calling
+block.
+Arguments to a
+.B function
+are placed on top of this return area.
+For both
+.B procedure
+and
+.B function
+calls, arguments are placed at the end of the expression evaluation area
+of the caller.
+When a
+.B function
+completes, expression evaluation can continue
+after popping the arguments to the
+.B function
+off the stack,
+exactly as if the function value had been ``loaded''.
+The arguments to a
+.B procedure
+are also popped off the stack by the caller
+after its execution ends.
+.KS
+.PP
+As a simple example consider the following stack structure
+for a call to a function
+.I f,
+of the form ``f(a)''.
+.so fig1.3.n
+.KE
+.PP
+If we suppose that
+.I f
+returns a
+.I real
+and that
+.I a
+is an integer,
+the calling sequence for this function would be:
+.DS
+.TS
+lp-2w(8) l.
+PUSH \-8
+RV4:\fIl a\fR
+CALL:\fIl f\fR
+POP 4
+.TE
+.DE
+.ZP
+Here we use the operator
+.SM PUSH
+to clear space for the return value,
+load
+.I a
+on the stack with a ``right value'' operator,
+call the function,
+pop off the argument
+.I a ,
+and can then complete evaluation of the containing expression.
+The operations used here will be explained in section 2.
+.PP
+If the function
+.I f
+were given by
+.LS
+ 10 \*bfunction\fR f(i: integer): real;
+ 11 \*bbegin\fR
+ 12 f := i
+ 13 \*bend\fR;
+.LE
+then
+.I f
+would have code sequence:
+.DS
+.TS
+lp-2w(8) l.
+BEG:2 0
+ 11
+ "f"
+LV:\fIl\fR 40
+RV4:\fIl\fR 32
+AS48
+END
+.TE
+.DE
+.ZP
+Here the
+.SM BEG
+operator takes 9 bytes of inline data.
+The first byte specifies the
+length of the function name.
+The second longword specifies the
+amount of local variable storage, here none.
+The succeeding two lines give the line number of the
+.B begin
+and the name of the block
+for error traceback.
+The
+.SM BEG
+operator places a name pointer in the block mark.
+The body of the
+.B function
+first takes an address of the
+.B function
+result variable
+.I f
+using the address of operator
+.SM LV .
+The next operation in the interpretation of this function is the loading
+of the value of
+.I i .
+.I I
+is at the level of the
+.B function
+.I f ,
+here symbolically
+.I l,
+and the first variable in the local variable area.
+The
+.B function
+completes by assigning the 4 byte integer on the stack to the 8 byte
+return location, hence the
+.SM AS48
+assignment operator, and then uses the
+.SM END
+operator to exit the current block.
+.NH 2
+The main interpreter loop
+.PP
+The main interpreter loop is simply:
+.DS
+.mD
+iloop:
+ \fBcaseb\fR (lc)+,$0,$255
+ <table of opcode interpreter addresses>
+.DE
+.ZP
+The main opcode is extracted from the first byte of the instruction
+and used to index into the table of opcode interpreter addresses.
+Control is then transferred to the specified location.
+The sub-opcode may be used to index the display,
+as a small constant,
+or to specify one of several relational operators.
+In the cases where a constant is needed, but it
+is not small enough to fit in the byte sub-operator,
+a zero is placed there and the constant follows in the next word.
+Zero is easily tested for,
+as the instruction that fetches the
+sub-opcode sets the condition code flags.
+A construction like:
+.DS
+.mD
+_OPER:
+ \fBcvtbl\fR (lc)+,r0
+ \fBbneq\fR L1
+ \fBcvtwl\fR (lc)+,r0
+L1: ...
+.DE
+is all that is needed to effect this packing of data.
+This technique saves space in the Pascal
+.I obj
+object code.
+.PP
+The address of the instruction at
+.I iloop
+is always contained in the register variable
+.I loop .
+Thus a return to the main interpreter is simply:
+.DS
+ \fBjmp\fR (loop)
+.DE
+that is both quick and occupies little space.
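+.PP
+A sketch of the same cycle in C follows; in the actual interpreter
+each operation jumps back through the
+.I loop
+register rather than returning to a caller.
+.DS
+.mD
+typedef void (*opfun)(void);
+extern opfun optab[256];        /* opcode interpreter routines */
+extern unsigned char *lc;       /* abstract machine location counter */
+
+void
+iloop(void)
+{
+        for (;;)
+                (*optab[*lc++])();      /* caseb (lc)+,$0,$255 */
+}
+.DE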
+.NH 2
+Errors
+.PP
+Errors during interpretation fall into three classes:
+.DS
+1) Interpreter detected errors.
+2) Hardware detected errors.
+3) External events.
+.DE
+.PP
+Interpreter detected errors include I/O errors and
+built-in function errors.
+These errors cause a subroutine call to an error routine
+with a single parameter indicating the cause of the error.
+Hardware errors such as range errors and overflows are
+fielded by a special routine that determines the opcode
+that caused the error.
+It then calls the error routine with an appropriate error
+parameter.
+External events include interrupts and system limits such
+as available memory.
+They generate a call to the error routine with an
+appropriate error code.
+The error routine processes the error condition,
+printing an appropriate error message and usually
+a backtrace from the point of the error.
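+.PP
+A sketch of the common error routine in C; the helper names here are
+assumed, not those of the actual interpreter.
+.DS
+.mD
+#include <stdio.h>
+
+extern char *errmsg[];  /* message text for each error code */
+extern void pmdump(void);       /* post-mortem backtrace (assumed) */
+extern void pexit(int);         /* unwind, flush, exit (assumed) */
+
+void
+error(int code)
+{
+        fprintf(stderr, "%s\en", errmsg[code]);
+        pmdump();               /* usually print a backtrace */
+        pexit(code);
+}
+.DE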
diff --git a/share/doc/papers/px/pxin2.n b/share/doc/papers/px/pxin2.n
new file mode 100644
index 000000000000..0a12b9077bce
--- /dev/null
+++ b/share/doc/papers/px/pxin2.n
@@ -0,0 +1,923 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)pxin2.n 5.2 (Berkeley) 4/17/91
+.\"
+.if !\n(xx .so tmac.p
+.nr H1 1
+.if n .ND
+.NH
+Operations
+.NH 2
+Naming conventions and operation summary
+.PP
+Table 2.1 outlines the opcode typing convention.
+The expression ``a above b'' means that `a' is on top
+of the stack with `b' below it.
+Table 2.3 describes each of the opcodes.
+The character `*' at the end of a name specifies that
+all operations with the root prefix
+before the `*'
+are summarized by one entry.
+Table 2.2 gives the codes used
+to describe the type of inline data expected by each instruction.
+.sp 2
+.so table2.1.n
+.sp 2
+.so table2.2.n
+.bp
+.so table2.3.n
+.bp
+.NH 2
+Basic control operations
+.LP
+.SH
+HALT
+.IP
+Corresponds to the Pascal procedure
+.I halt ;
+causes execution to end with a post-mortem backtrace as if a run-time
+error had occurred.
+.SH
+BEG s,W,w,"
+.IP
+Causes the second part of the block mark to be created, and
+.I W
+bytes of local variable space to be allocated and cleared to zero.
+Stack overflow is detected here.
+.I w
+is the first line of the body of this section for error traceback,
+and the inline string (length s) is the character representation of its name.
+.SH
+NODUMP s,W,w,"
+.IP
+Equivalent to
+.SM BEG ,
+and used to begin the main program when the ``p''
+option is disabled so that the post-mortem backtrace will be inhibited.
+.SH
+END
+.IP
+Complementary to the operators
+.SM CALL
+and
+.SM BEG ,
+exits the current block, calling the procedure
+.I pclose
+to flush the buffers of, and release, any local files.
+Restores the environment of the caller from the block mark.
+If this is the end for the main program, all files are
+.I flushed,
+and the interpreter is exited.
+.SH
+CALL l,A
+.IP
+Saves the current line number, return address, and active display entry pointer
+.I dp
+in the first part of the block mark, then transfers to the entry point
+given by the relative address
+.I A ,
+that is the beginning of a
+.B procedure
+or
+.B function
+at level
+.I l.
+.SH
+PUSH s
+.IP
+Clears
+.I s
+bytes on the stack.
+Used to make space for the return value of a
+.B function
+just before calling it.
+.SH
+POP s
+.IP
+Pop
+.I s
+bytes off the stack.
+Used after a
+.B function
+or
+.B procedure
+returns to remove the arguments from the stack.
+.SH
+TRA a
+.IP
+Transfer control to relative address
+.I a
+as a local
+.B goto
+or part of a structured statement.
+.SH
+TRA4 A
+.IP
+Transfer control to an absolute address as part of a non-local
+.B goto
+or to branch over procedure bodies.
+.SH
+LINO s
+.IP
+Set current line number to
+.I s.
+For consistency, check that the expression stack is empty
+as it should be (as this is the start of a statement).
+This consistency check will fail only if there is a bug in the
+interpreter or the interpreter code has somehow been damaged.
+Increment the statement count and if it exceeds the statement limit,
+generate a fault.
+.SH
+GOTO l,A
+.IP
+Transfer control to address
+.I A
+that is in the block at level
+.I l
+of the display.
+This is a non-local
+.B goto.
+Causes each block to be exited as if with
+.SM END ,
+flushing and freeing files with
+.I pclose,
+until the current display entry is at level
+.I l.
+.SH
+SDUP*
+.IP
+Duplicate the word or long on the top of
+the stack.
+This is used mostly for constructing sets.
+See section 2.11.
+.NH 2
+If and relational operators
+.SH
+IF a
+.IP
+All conditional transfers in the interpreter take place using this
+operator, that examines the Boolean value on the top of the stack.
+If the value is
+.I true ,
+the next code is executed,
+otherwise control transfers to the specified address.
+.SH
+REL* r
+.IP
+These take two arguments on the stack,
+and the sub-operation code specifies the relational operation to
+be done, coded as follows with `a' above `b' on the stack:
+.DS
+.mD
+.TS
+lb lb
+c a.
+Code Operation
+_
+0 a = b
+2 a <> b
+4 a < b
+6 a > b
+8 a <= b
+10 a >= b
+.TE
+.DE
+.IP
+Each operation does a test to set the condition code
+appropriately and then does an indexed branch based on the
+sub-operation code to a test of the condition here specified,
+pushing a Boolean value on the stack.
+.IP
+Consider the statement fragment:
+.DS
+.mD
+\*bif\fR a = b \*bthen\fR
+.DE
+.IP
+If
+.I a
+and
+.I b
+are integers this generates the following code:
+.DS
+.TS
+lp-2w(8) l.
+RV4:\fIl a\fR
+RV4:\fIl b\fR
+REL4 \&=
+IF \fIElse part offset\fR
+.sp
+.T&
+c s.
+\fI\&... Then part code ...\fR
+.TE
+.DE
+.NH 2
+Boolean operators
+.PP
+The Boolean operators
+.SM AND ,
+.SM OR ,
+and
+.SM NOT
+manipulate values on the top of the stack.
+All Boolean values are kept in single bytes in memory,
+or in single words on the stack.
+Zero represents a Boolean \fIfalse\fP, and one a Boolean \fItrue\fP.
+.NH 2
+Right value, constant, and assignment operators
+.SH
+LRV* l,A
+.br
+RV* l,a
+.IP
+The right value operators load values on the stack.
+They take a block number as a sub-opcode and load the appropriate
+number of bytes from that block at the offset specified
+in the following word onto the stack. As an example, consider
+.SM LRV4 :
+.DS
+.mD
+_LRV4:
+ \fBcvtbl\fR (lc)+,r0 #r0 has display index
+ \fBaddl3\fR _display(r0),(lc)+,r1 #r1 has variable address
+ \fBpushl\fR (r1) #put value on the stack
+ \fBjmp\fR (loop)
+.DE
+.IP
+Here the interpreter places the display level in r0.
+It then adds the appropriate display value to the inline offset and
+pushes the value at this location onto the stack.
+Control then returns to the main
+interpreter loop.
+The
+.SM RV*
+operators have short inline data that
+reduces the space required to address the first 32K of
+stack space in each stack frame.
+The operators
+.SM RV14
+and
+.SM RV24
+provide explicit conversion to long as the data
+is pushed.
+This saves the generation of
+.SM STOI
+to align arguments to
+.SM C
+subroutines.
+.SH
+CON* r
+.IP
+The constant operators load a value onto the stack from inline code.
+Small integer values are condensed and loaded by the
+.SM CON1
+operator, that is given by
+.DS
+.mD
+_CON1:
+ \fBcvtbw\fR (lc)+,\-(sp)
+ \fBjmp\fR (loop)
+.DE
+.IP
+Note that little work is required here, as the
+constant is available directly at (lc)+.
+For longer constants,
+.I lc
+must be incremented before moving the constant.
+The operator
+.SM CON
+takes a length specification in the sub-opcode and can be used to load
+strings and other variable length data onto the stack.
+The operators
+.SM CON14
+and
+.SM CON24
+provide explicit conversion to long as the constant is pushed.
+.SH
+AS*
+.IP
+The assignment operators are similar to arithmetic and relational operators
+in that they take two operands, both in the stack,
+but the lengths given for them specify
+first the length of the value on the stack and then the length
+of the target in memory.
+The target address in memory is under the value to be stored.
+Thus the statement
+.DS
+i := 1
+.DE
+.IP
+where
+.I i
+is a full-length, 4 byte, integer,
+will generate the code sequence
+.DS
+.TS
+lp-2w(8) l.
+LV:\fIl i\fP
+CON1:1
+AS24
+.TE
+.DE
+.IP
+Here
+.SM LV
+will load the address of
+.I i,
+that is really given as a block number in the sub-opcode and an
+offset in the following word,
+onto the stack, occupying a single word.
+.SM CON1 ,
+that is a single word instruction,
+then loads the constant 1,
+that is in its sub-opcode,
+onto the stack.
+Since there are no one byte constants on the stack,
+this becomes a 2 byte, single word integer.
+The interpreter then assigns a length 2 integer to a length 4 integer using
+.SM AS24 \&.
+The code sequence for
+.SM AS24
+is given by:
+.DS
+.mD
+_AS24:
+ \fBincl\fR lc
+ \fBcvtwl\fR (sp)+,*(sp)+
+ \fBjmp\fR (loop)
+.DE
+.IP
+Thus the interpreter gets the single word off the stack,
+extends it to be a 4 byte integer,
+gets the target address off the stack,
+and finally stores the value in the target.
+This is a typical use of the constant and assignment operators.
+.NH 2
+Addressing operations
+.SH
+LLV l,W
+.br
+LV l,w
+.IP
+The most common operation done by the interpreter
+is the ``left value'' or ``address of'' operation.
+It is given by:
+.DS
+.mD
+_LLV:
+ \fBcvtbl\fR (lc)+,r0 #r0 has display index
+ \fBaddl3\fR _display(r0),(lc)+,\-(sp) #push address onto the stack
+ \fBjmp\fR (loop)
+.DE
+.IP
+It calculates an address in the block specified in the sub-opcode
+by adding the associated display entry to the
+offset that appears in the following word.
+The
+.SM LV
+operator has a short inline data form that reduces the space
+required to address the first 32K of stack space in each call frame.
+.SH
+OFF s
+.IP
+The offset operator is used in field names.
+Thus to get the address of
+.LS
+p^.f1
+.LE
+.IP
+.I pi
+would generate the sequence
+.DS
+.mD
+.TS
+lp-2w(8) l.
+RV:\fIl p\fP
+OFF \fIf1\fP
+.TE
+.DE
+.IP
+where the
+.SM RV
+loads the value of
+.I p,
+given its block in the sub-opcode and offset in the following word,
+and the interpreter then adds the offset of the field
+.I f1
+in its record to get the correct address.
+.SM OFF
+takes its argument in the sub-opcode if it is small enough.
+.SH
+NIL
+.IP
+The example above is incomplete, lacking a check for a
+.B nil
+pointer.
+The code generated would be
+.DS
+.TS
+lp-2w(8) l.
+RV:\fIl p\fP
+NIL
+OFF \fIf1\fP
+.TE
+.DE
+.IP
+where the
+.SM NIL
+operation checks for a
+.I nil
+pointer and generates the appropriate runtime error if it is \fInil\fP.
+.SH
+LVCON s,"
+.IP
+A pointer to the specified length inline data is pushed
+onto the stack.
+This is primarily used for
+.I printf
+type strings used by
+.SM WRITEF .
+(see sections 3.6 and 3.8)
+.SH
+INX* s,w,w
+.IP
+The operators
+.SM INX2
+and
+.SM INX4
+are used for subscripting.
+For example, the statement
+.DS
+a[i] := 2.0
+.DE
+.IP
+with
+.I i
+an integer and
+.I a
+an
+``array [1..1000] of real''
+would generate
+.DS
+.TS
+lp-2w(8) l.
+LV:\fIl a\fP
+RV4:\fIl i\fP
+INX4:8 1,999
+CON8 2.0
+AS8
+.TE
+.DE
+.IP
+Here the
+.SM LV
+operation takes the address of
+.I a
+and places it on the stack.
+The value of
+.I i
+is then placed on top of this on the stack.
+The array address is indexed by the
+length 4 index (a length 2 index would use
+.SM INX2 )
+where the individual elements have a size of 8 bytes.
+The code for
+.SM INX4
+is:
+.DS
+.mD
+_INX4:
+ \fBcvtbl\fR (lc)+,r0
+ \fBbneq\fR L1
+ \fBcvtwl\fR (lc)+,r0 #r0 has size of records
+L1:
+ \fBcvtwl\fR (lc)+,r1 #r1 has lower bound
+ \fBmovzwl\fR (lc)+,r2 #r2 has upper-lower bound
+ \fBsubl3\fR r1,(sp)+,r3 #r3 has base subscript
+ \fBcmpl\fR r3,r2 #check for out of bounds
+ \fBbgtru\fR esubscr
+ \fBmull2\fR r0,r3 #calculate byte offset
+ \fBaddl2\fR r3,(sp) #calculate actual address
+ \fBjmp\fR (loop)
+esubscr:
+ \fBmovw\fR $ESUBSCR,_perrno
+ \fBjbr\fR error
+.DE
+.IP
+Here the lower bound is subtracted, and range checked against the
+upper minus lower bound.
+The offset is then scaled to a byte offset into the array
+and added to the base address on the stack.
+Multi-dimension subscripts are translated as a sequence of single subscriptings.
+.SH
+IND*
+.IP
+For indirect references through
+.B var
+parameters and pointers,
+the interpreter has a set of indirection operators that convert a pointer
+on the stack into a value on the stack from that address.
+Different
+.SM IND
+operators are necessary because of the possibility of different
+length operands.
+The
+.SM IND14
+and
+.SM IND24
+operators do conversions to long
+as they push their data.
+.NH 2
+Arithmetic operators
+.PP
+The interpreter has many arithmetic operators.
+All operators produce results long enough to prevent overflow
+unless the bounds of the base type are exceeded.
+The basic operators available are
+.DS
+Addition: ADD*, SUCC*
+Subtraction: SUB*, PRED*
+Multiplication: MUL*, SQR*
+Division: DIV*, DVD*, MOD*
+Unary: NEG*, ABS*
+.DE
+.NH 2
+Range checking
+.PP
+The interpreter has several range checking operators.
+The important distinction among these operators is between values whose
+legal range begins at zero and those whose range does not,
+for example
+a subrange variable whose values range from 45 to 70.
+For ranges that begin at zero, a simpler ``logical'' comparison against
+the upper bound suffices.
+For the others, both the lower and upper bounds must be checked
+independently, requiring two comparisons.
+On the
+.SM "VAX 11/780"
+both checks are done using a single index instruction
+so the only gain is in reducing the inline data.
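+.PP
+The distinction can be sketched in C.
+The routine names here merely follow the flavor of the range checking
+operators, and the error code name and value are assumed.
+.DS
+.mD
+extern void error(int);
+#define ERANG 1         /* range error code; value assumed */
+
+void
+rang4(long v, long lo, long hi)         /* general bounds */
+{
+        if (v < lo || v > hi)           /* two comparisons */
+                error(ERANG);
+}
+
+void
+rsng4(unsigned long v, unsigned long hi)  /* lower bound zero */
+{
+        if (v > hi)             /* one ``logical'' comparison */
+                error(ERANG);
+}
+.DE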
+.NH 2
+Case operators
+.PP
+The interpreter includes three operators for
+.B case
+statements that are used depending on the width of the
+.B case
+label type.
+For each width, the structure of the case data is the same, and
+is represented in figure 2.4.
+.sp 1
+.so fig2.4.n
+.PP
+The
+.SM CASEOP
+case statement operators do a sequential search through the
+case label values.
+If they find the label value, they take the corresponding entry
+from the transfer table and cause the interpreter to branch to the
+specified statement.
+If the specified label is not found, an error results.
+.PP
+The
+.SM CASE
+operators take the number of cases as a sub-opcode
+if possible.
+Three different operators are needed to handle single byte,
+word, and long case transfer table values.
+For example, the
+.SM CASEOP1
+operator has the following code sequence:
+.DS
+.mD
+_CASEOP1:
+ \fBcvtbl\fR (lc)+,r0
+ \fBbneq\fR L1
+ \fBcvtwl\fR (lc)+,r0 #r0 has length of case table
+L1:
+ \fBmovaw\fR (lc)[r0],r2 #r2 has pointer to case labels
+ \fBmovzwl\fR (sp)+,r3 #r3 has the element to find
+ \fBlocc\fR r3,r0,(r2) #r0 has index of located element
+ \fBbeql\fR caserr #element not found
+ \fBmnegl\fR r0,r0 #calculate new lc
+ \fBcvtwl\fR (r2)[r0],r1 #r1 has lc offset
+ \fBaddl2\fR r1,lc
+ \fBjmp\fR (loop)
+caserr:
+ \fBmovw\fR $ECASE,_perrno
+ \fBjbr\fR error
+.DE
+.PP
+Here the interpreter first computes the address of the beginning
+of the case label value area by adding twice the number of case label
+values to the address of the transfer table, since the transfer
+table entries are 2 byte address offsets.
+It then searches through the label values, and generates an ECASE
+error if the label is not found.
+If the label is found, the index of the corresponding entry
+in the transfer table is extracted and that offset is added
+to the interpreter location counter.
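+.PP
+The search can be sketched in C as follows, with
+.I lc
+assumed to point at the transfer table when the operator begins, and
+the error code value assumed:
+.DS
+.mD
+extern unsigned char *lc;
+extern void error(int);
+#define ECASE 2                 /* value assumed */
+
+void
+caseop1(int value, int n)       /* n cases; value from the stack */
+{
+        short *table = (short *)lc;     /* 2 byte branch offsets */
+        unsigned char *label = lc + 2 * n;  /* case label values */
+        int i;
+
+        for (i = 0; i < n; i++)
+                if (label[i] == value) {
+                        lc += table[i]; /* branch to that statement */
+                        return;
+                }
+        error(ECASE);
+}
+.DE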
+.NH 2
+Operations supporting pxp
+.PP
+The following operations are defined to do execution profiling.
+.SH
+PXPBUF w
+.IP
+Causes the interpreter to allocate a count buffer
+with
+.I w
+four byte counters
+and to clear them to zero.
+The count buffer is placed within an image of the
+.I pmon.out
+file as described in the
+.I "PXP Implementation Notes."
+The contents of this buffer are written to the file
+.I pmon.out
+when the program ends.
+.SH
+COUNT w
+.IP
+Increments the counter specified by
+.I w .
+.SH
+TRACNT w,A
+.IP
+Used at the entry point to procedures and functions,
+combining a transfer to the entry point of the block with
+an incrementing of its entry count.
+.NH 2
+Set operations
+.PP
+The set operations:
+union
+.SM ADDT,
+intersection
+.SM MULT,
+element removal
+.SM SUBT,
+and the set relationals
+.SM RELT
+are straightforward.
+The following operations are more interesting.
+.SH
+CARD s
+.IP
+Takes the cardinality of a set of size
+.I s
+bytes on top of the stack, leaving a 2 byte integer count.
+.SM CARD
+uses the
+.B ffs
+opcode to successively count the number of set bits in the set.
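+.IP
+A sketch of the counting loop in C, using the library
+.I ffs
+routine in place of the hardware opcode:
+.DS
+.mD
+#include <strings.h>    /* ffs() */
+
+int
+card(unsigned int *set, int nwords)
+{
+        int count = 0, i, bit;
+        unsigned int w;
+
+        for (i = 0; i < nwords; i++)
+                for (w = set[i]; (bit = ffs((int)w)) != 0; count++)
+                        w &= ~(1u << (bit - 1));  /* clear the bit */
+        return (count);
+}
+.DE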
+.SH
+CTTOT s,w,w
+.IP
+Constructs a set.
+This operation requires a non-trivial amount of work,
+checking bounds and setting individual bits or ranges of bits.
+This operation sequence is slow,
+and motivates the presence of the operator
+.SM INCT
+below.
+The arguments to
+.SM CTTOT
+include the number of elements
+.I s
+in the constructed set,
+the lower and upper bounds of the set,
+the two
+.I w
+values,
+and a pair of values on the stack for each range in the set, single
+elements in constructed sets being duplicated with
+.SM SDUP
+to form degenerate ranges.
+.SH
+IN s,w,w
+.IP
+The operator
+.B in
+for sets.
+The value
+.I s
+specifies the size of the set,
+the two
+.I w
+values the lower and upper bounds of the set.
+The value on the stack is checked to be in the set on the stack,
+and a Boolean value of
+.I true
+or
+.I false
+replaces the operands.
+.SH
+INCT
+.IP
+The operator
+.B in
+on a constructed set without constructing it.
+The left operand of
+.B in
+is on top of the stack followed by the number of pairs in the
+constructed set,
+and then the pairs themselves, all as single word integers.
+Pairs designate runs of values, and single values are represented by
+a degenerate pair with both values equal.
+This operator is generated in grammatical constructs such as
+.LS
+\fBif\fR character \fBin\fR [`+', `\-', `*', `/']
+.LE
+.IP
+or
+.LS
+\fBif\fR character \fBin\fR [`a'..`z', `$', `_']
+.LE
+.IP
+These constructs are common in Pascal, and
+.SM INCT
+makes them run much faster in the interpreter,
+as if they were written as an efficient series of
+.B if
+statements.
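+.IP
+The test itself is only a short loop.
+A sketch in C, assuming each pair is stored low bound first:
+.DS
+.mD
+/* the "in" test on an unconstructed set of npairs ranges */
+int
+inct(int value, int npairs, short *pair)
+{
+        int i;
+
+        for (i = 0; i < npairs; i++)
+                if (value >= pair[2 * i] && value <= pair[2 * i + 1])
+                        return (1);     /* within this run */
+        return (0);
+}
+.DE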
+.NH 2
+Miscellaneous
+.PP
+Other miscellaneous operators that are present in the interpreter
+are
+.SM ASRT
+that causes the program to end if the Boolean value on the stack is not
+.I true,
+and
+.SM STOI ,
+.SM STOD ,
+.SM ITOD ,
+and
+.SM ITOS
+that convert between different length arithmetic operands for
+use in aligning the arguments in
+.B procedure
+and
+.B function
+calls, and with some untyped built-ins, such as
+.SM SIN
+and
+.SM COS \&.
+.PP
+Finally, if the program is run with the run-time testing disabled, there
+are special operators for
+.B for
+statements
+and special indexing operators for arrays
+that have individual element size that is a power of 2.
+The code can run significantly faster using these operators.
+.NH 2
+Mathematical Functions
+.PP
+The transcendental functions
+.SM SIN ,
+.SM COS ,
+.SM ATAN ,
+.SM EXP ,
+.SM LN ,
+.SM SQRT ,
+.SM SEED ,
+and
+.SM RANDOM
+are taken from the standard UNIX
+mathematical package.
+These functions take double precision floating point
+values and return the same.
+.PP
+The functions
+.SM EXPO ,
+.SM TRUNC ,
+and
+.SM ROUND
+take a double precision floating point number.
+.SM EXPO
+returns an integer representing the machine
+representation of its argument's exponent,
+.SM TRUNC
+returns the integer part of its argument, and
+.SM ROUND
+returns the rounded integer part of its argument.
+.NH 2
+System functions and procedures
+.SH
+LLIMIT
+.IP
+A line limit and a file pointer are passed on the stack.
+If the limit is non-negative the line limit is set to the
+specified value, otherwise it is set to unlimited.
+The default is unlimited.
+.SH
+STLIM
+.IP
+A statement limit is passed on the stack. The statement limit
+is set as specified.
+The default is 500,000.
+No limit is enforced when the ``p'' option is disabled.
+.SH
+CLCK
+.br
+SCLCK
+.IP
+.SM CLCK
+returns the number of milliseconds of user time used by the program;
+.SM SCLCK
+returns the number of milliseconds of system time used by the program.
+.SH
+WCLCK
+.IP
+The number of seconds since some predefined time is
+returned. Its primary usefulness is in determining
+elapsed time and in providing a unique time stamp.
+.sp
+.LP
+The other system time procedures are
+.SM DATE
+and
+.SM TIME
+that copy an appropriate text string into a Pascal string array.
+The function
+.SM ARGC
+returns the number of command line arguments passed to the program.
+The procedure
+.SM ARGV
+takes an index on the stack and copies the specified
+command line argument into a Pascal string array.
+.NH 2
+Pascal procedures and functions
+.SH
+PACK s,w,w,w
+.br
+UNPACK s,w,w,w
+.IP
+These operators function as a memory to memory move with several
+semantic checks.
+They do no ``unpacking'' or ``packing'' in the true sense as the
+interpreter supports no packed data types.
+.SH
+NEW s
+.br
+DISPOSE s
+.IP
+An
+.SM LV
+of a pointer is passed.
+.SM NEW
+allocates a record of a specified size and puts a pointer
+to it into the pointer variable.
+.SM DISPOSE
+deallocates the record pointed to by the pointer
+and sets the pointer to
+.SM NIL .
+.sp
+.LP
+The function
+.SM CHR*
+converts a suitably small integer into an \s-2ASCII\s0 character.
+Its primary purpose is to do a range check.
+The function
+.SM ODD*
+returns
+.I true
+if its argument is odd and returns
+.I false
+if its argument is even.
+The function
+.SM UNDEF
+always returns the value
+.I false .
diff --git a/share/doc/papers/px/pxin3.n b/share/doc/papers/px/pxin3.n
new file mode 100644
index 000000000000..91944603a0a7
--- /dev/null
+++ b/share/doc/papers/px/pxin3.n
@@ -0,0 +1,597 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)pxin3.n 5.2 (Berkeley) 4/17/91
+.\"
+.if !\n(xx .so tmac.p
+.ta 8n 16n 24n
+.nr H1 2
+.if n .ND
+.NH
+Input/output
+.NH 2
+The files structure
+.PP
+Each file in the Pascal environment is represented by a pointer
+to a
+.I files
+structure in the heap.
+At the location addressed by the pointer is the element
+in the file's window variable.
+Behind this window variable is information about the file,
+at the following offsets:
+.so table3.1.n
+.PP
+Here
+.SM FBUF
+is a pointer to the system FILE block for the file.
+The standard system I/O library is
+used that provides block buffered input/output,
+with 1024 characters normally transferred at each read or write.
+.PP
+The files in the
+Pascal environment,
+are all linked together on a single file chain through the
+.SM FCHAIN
+links.
+For each file the
+.SM FLEV
+pointer gives its associated file variable.
+These are used to free files at block exit as described in section 3.3
+below.
+.PP
+The
+FNAME
+and
+PFNAME
+give the associated
+file name for the file and the name to be used when printing
+error diagnostics, respectively.
+Although these names are usually the same,
+.I input
+and
+.I output
+usually have no associated
+file name so the distinction is necessary.
+.PP
+The
+FUNIT
+word contains
+a set of flags
+whose representations are:
+.TS
+center;
+l l l.
+EOF 0x0100 At end-of-file
+EOLN 0x0200 At end-of-line (text files only)
+SYNC 0x0400 File window is out of sync
+TEMP 0x0800 File is temporary
+FREAD 0x1000 File is open for reading
+FWRITE 0x2000 File is open for writing
+FTEXT 0x4000 File is a text file; process EOLN
+FDEF 0x8000 File structure created, but file not opened
+.TE
+.PP
+The
+EOF
+and
+EOLN
+bits here reflect the associated built-in function values.
+TEMP
+specifies that the file has a generated temporary name and that
+it should therefore be removed when its block exits.
+FREAD
+and
+FWRITE
+specify that
+.I reset
+and
+.I rewrite
+respectively have been done on the file so that
+input or output operations can be done.
+FTEXT
+specifies the file is a text file so that
+EOLN
+processing should be done,
+with newline characters turned into blanks, etc.
+.PP
+The
+SYNC
+bit,
+when true,
+specifies that there is no usable image in the file buffer window.
+As discussed in the
+.I "Berkeley Pascal User's Manual,"
+the interactive environment necessitates having
+``input^'' undefined at the beginning
+of execution so that a program may print a prompt
+before the user is required to type input.
+The
+SYNC
+bit implements this.
+When it is set,
+it specifies that the element in the window
+must be updated before it can be used.
+This is never done until necessary.
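+.PP
+Rendered as a C structure, the file record is roughly as follows.
+The layout is a sketch of Table 3.1; note that the Pascal file
+pointer actually addresses the window, with the remaining fields
+lying behind it at negative offsets.
+.DS
+.mD
+#include <stdio.h>
+
+struct iorec {
+        FILE    *fbuf;          /* system FILE block (FBUF) */
+        struct iorec *fchain;   /* active file chain (FCHAIN) */
+        char    *flev;          /* associated file variable (FLEV) */
+        char    *fname;         /* file name (FNAME) */
+        char    *pfname;        /* diagnostic name (PFNAME) */
+        short   funit;          /* flag word (FUNIT), bits above */
+        char    window[1];      /* the file's window variable */
+};
+.DE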
+.NH 2
+Initialization of files
+.PP
+All the variables in the Pascal runtime environment are cleared to zero on
+block entry.
+This is necessary for simple processing of files.
+If a file is unused, its pointer will be
+.B nil.
+All references to an inactive file are thus references through a
+.B nil
+pointer.
+If the Pascal system did not clear storage to zero before execution
+it would not be possible to detect inactive files in this simple way;
+it would probably be necessary to generate (possibly complicated)
+code to initialize
+each file on block entry.
+.PP
+When a file is first mentioned in a
+.I reset
+or
+.I rewrite
+call,
+a buffer of the form described above is associated with it,
+and the necessary information about the file is placed in this
+buffer.
+The file is also linked into the active file chain.
+This chain is kept sorted by block mark address, the
+FLEV
+entries.
+.NH 2
+Block exit
+.PP
+When block exit occurs the interpreter must free the files that are in
+use in the block
+and their associated buffers.
+This is simple and efficient because the files in the active file chain are
+sorted by increasing block mark address.
+This means that the files for the current block will be at the front
+of the chain.
+For each file that is no longer accessible
+the interpreter first flushes the files buffer
+if it is an output file.
+The interpreter then returns the file buffer and the files structure and window
+to the free space in the heap and removes the file from the active file chain.
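+.PP
+A sketch of this cleanup in C, using the structure sketched in
+section 3.1; the comparison direction is assumed, and the handling of
+temporary files and of the window buffer is omitted.
+.DS
+.mD
+#include <stdio.h>
+#include <stdlib.h>
+
+#define FWRITE 0x2000   /* from the flag table above */
+
+extern struct iorec *fchain;    /* exiting block's files first */
+
+void
+pclose(char *limit)     /* file variables past limit are dead */
+{
+        struct iorec *f;
+
+        while ((f = fchain) != NULL && f->flev < limit) {
+                if (f->funit & FWRITE)
+                        fflush(f->fbuf);        /* flush output */
+                fclose(f->fbuf);
+                fchain = f->fchain;     /* unlink, then free */
+                free(f);
+        }
+}
+.DE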
+.NH 2
+Flushing
+.PP
+Flushing all the file buffers at abnormal termination,
+or on a call to the procedure
+.I flush
+or
+.I message
+is done by flushing
+each file on the file chain that has the
+FWRITE
+bit set in its flags word.
+.NH 2
+The active file
+.PP
+For input-output,
+.I px
+maintains a notion of an active file.
+Each operation that references a file makes the file
+it will be using the active file and then does its operation.
+A subtle point here is that one may do a procedure call to
+.I write
+that involves a call to a function that references another file,
+thereby destroying the active file set up before the
+.I write.
+Thus the active file is saved at block entry
+in the block mark and restored at block exit.\*(Dg
+.FS
+\*(dg\ It would probably be better to dispense with the notion of
+active file and use another mechanism that did not involve extra
+overhead on each procedure and function call.
+.FE
+.NH 2
+File operations
+.PP
+Files in Pascal can be used in two distinct ways:
+as the object of
+.I read,
+.I write,
+.I get,
+and
+.I put
+calls, or indirectly as though they were pointers.
+The second use, as pointers, requires care
+not to destroy the active file in a reference such as
+.LS
+write(output, input\(ua)
+.LE
+or the system would incorrectly write on the input device.
+.PP
+The fundamental operator related to the use of a file is
+.SM FNIL.
+This takes the file variable, as a pointer,
+insures that the pointer is not
+.B nil,
+and also that a usable image is in the file window,
+by forcing the
+.SM SYNC
+bit to be cleared.
+.PP
+A simple example that demonstrates the use of the file operators
+is given by
+.LS
+writeln(f)
+.LE
+that produces
+.DS
+.mD
+.TS
+lp-2w(8) l.
+RV:\fIl f\fP
+UNIT
+WRITLN
+.TE
+.DE
+.NH 2
+Read operations
+.SH
+GET
+.IP
+Advance the active file to the next input element.
+.SH
+FNIL
+.IP
+A file pointer is on the stack. Insure that the associated file is active
+and that the file is synced so that there is input available in the window.
+.SH
+READ*
+.IP
+If the file is a text file, read a block of text
+and convert it to the internal type of the specified
+operand. If the file is not a text file then
+do an unformatted read of the next record.
+The procedure
+.SM READLN
+reads up to and including the next end of line character.
+.SH
+READE A
+.IP
+The operator
+.SM READE
+reads a string name of an enumerated type and converts it
+to its internal value.
+.SM READE
+takes a pointer to a data structure as shown in figure 3.2.
+.so fig3.2.n
+See the description of
+.SM NAM
+in the next section for an example.
+.NH 2
+Write operations
+.SH
+PUT
+.IP
+Output the element in the active file window.
+.SH
+WRITEF s
+.IP
+The argument(s) on the stack are output
+by the
+.I fprintf
+standard
+.SM I/O
+library routine.
+The sub-opcode
+.I s
+specifies the number
+of longword arguments on the stack.
+.SH
+WRITEC
+.IP
+The character on the top of the stack is output
+without formatting. Formatted characters must be output with
+.SM WRITEF .
+.SH
+WRITES
+.IP
+The string specified by the pointer on the top of the stack is output
+by the
+.I fwrite
+standard
+.SM I/O
+library routine.
+All characters including nulls are printed.
+.SH
+WRITLN
+.IP
+A linefeed is output to the active file.
+The line-count for the file is
+incremented and checked against the line limit.
+.SH
+PAGE
+.IP
+A formfeed is output to the active file.
+.SH
+NAM A
+.IP
+The value on the top of the stack is converted to a pointer
+to an enumerated type string name.
+The address
+.SM A
+points to an enumerated type structure identical
+to that used by
+.SM READE .
+An error is raised if the value is out of range.
+The form of this structure for the predefined type
+.B boolean
+is shown in figure 3.3.
+.so fig3.3.n
+The code for
+.SM NAM
+is
+.DS
+.mD
+_NAM:
+ \fBincl\fR lc
+ \fBaddl3\fR (lc)+,ap,r6 #r6 points to scalar name list
+ \fBmovl\fR (sp)+,r3 #r3 has data value
+ \fBcmpw\fR r3,(r6)+ #check value out of bounds
+ \fBbgequ\fR enamrng
+ \fBmovzwl\fR (r6)[r3],r4 #r4 has string index
+ \fBpushab\fR (r6)[r4] #push string pointer
+ \fBjmp\fR (loop)
+enamrng:
+ \fBmovw\fR $ENAMRNG,_perrno
+ \fBjbr\fR error
+.DE
+The address of the table is calculated by adding the base address
+of the interpreter code,
+.I ap
+to the offset pointed to by
+.I lc .
+The first word of the table gives the number of records and
+provides a range check of the data to be output.
+The pointer is then calculated as
+.DS
+.mD
+tblbase = ap + A;
+size = *tblbase++;
+return(tblbase + tblbase[value]);
+.DE
+.SH
+MAX s,w
+.IP
+The sub-opcode
+.I s
+is subtracted from the integer on the top of the stack.
+The maximum of the result and the second argument,
+.I w ,
+replaces the value on the top of the stack.
+This function verifies that variable width arguments specified by
+the program are non-negative, and meet certain minimum width
+requirements.
+.SH
+MIN s
+.IP
+The minimum of the value on the top of the stack
+and the sub-opcode replaces the value on the top
+of the stack.
+.sp 1
+.LP
+The uses of files and the file operations are summarized
+in an example which outputs a real variable (r) with a variable
+width field (i).
+.LS
+writeln('r =',r:i,' ',true);
+.LE
+that generates the code
+.DS
+.mD
+.TS
+lp-2w(8) l.
+UNITOUT
+FILE
+CON14:1
+CON14:3
+LVCON:4 "r ="
+WRITES
+RV8\fI:l r\fP
+RV4\fI:l i\fP
+MAX:8 1
+RV4\fI:l i\fP
+MAX:1 1
+LVCON:8 " %*.*E"
+FILE
+WRITEF:6
+CONC4 \' \'
+WRITEC
+CON14:1
+NAM \fIbool\fP
+LVCON:4 "%s"
+FILE
+WRITEF:3
+WRITLN
+.TE
+.DE
+.PP
+Here the operator
+.SM UNITOUT
+is an abbreviated form of the operator
+.SM UNIT
+that is used when the file to be made active is
+.I output .
+A file descriptor, record count, string size, and a pointer
+to the constant string ``r ='' are pushed
+and then output by
+.SM WRITES .
+Next the value of
+.I r
+is pushed on the stack
+and the precision size is calculated by taking
+seven less than the width, but not less than one.
+This is followed by the width that is reduced by
+one to leave space for the required leading blank.
+If the width is too narrow, it
+is expanded by
+.I fprintf .
+A pointer to the format string is pushed followed
+by a file descriptor and the operator
+.SM WRITEF
+that prints out
+.I r .
+The value of six on
+.SM WRITEF
+comes from two longs for
+.I r
+and a long each for the precision, width, format string pointer,
+and file descriptor.
+The operator
+.SM CONC4
+pushes the
+.I blank
+character onto a long on the stack that is then printed out by
+.SM WRITEC .
+The internal representation for
+.I true
+is pushed as a long onto the stack and is
+then replaced by a pointer to the string ``true''
+by the operator
+.SM NAM
+using the table
+.I bool
+for conversion.
+This string is output by the operator
+.SM WRITEF
+using the format string ``%s''.
+Finally the operator
+.SM WRITLN
+appends a newline to the file.
+.NH 2
+File activation and status operations
+.SH
+UNIT*
+.IP
+The file pointed to by the file pointer on the top
+of the stack is converted to be the active file.
+The opcodes
+.SM UNITINP
+and
+.SM UNITOUT
+imply standard input and output respectively
+instead of explicitly pushing their file pointers.
+.SH
+FILE
+.IP
+The standard
+.SM I/O
+library file descriptor associated with the active file
+is pushed onto the stack.
+.SH
+EOF
+.IP
+The file pointed to by the file pointer on the top
+of the stack is checked for end of file. A boolean
+is returned with
+.I true
+indicating the end of file condition.
+.SH
+EOLN
+.IP
+The file pointed to by the file pointer on the top
+of the stack is checked for end of line. A boolean
+is returned with
+.I true
+indicating the end of line condition.
+Note that only text files can check for end of line.
+.NH 2
+File housekeeping operations
+.SH
+DEFNAME
+.IP
+Four data items are passed on the stack;
+the size of the data type associated with the file,
+the maximum size of the file name,
+a pointer to the file name,
+and a pointer to the file variable.
+A file record is created with the specified window size
+and the file variable set to point to it.
+The file is marked as defined but not opened.
+This allows
+.B program
+statement association of file names with file variables
+before their use by a
+.SM RESET
+or a
+.SM REWRITE .
+.SH
+BUFF s
+.IP
+The sub-opcode is placed in the external variable
+.I _bufopt
+to specify the amount of I/O buffering that is desired.
+The current options are:
+.DS
+0 \- character at a time buffering
+1 \- line at a time buffering
+2 \- block buffering
+.DE
+The default value is 1.
+.SH
+RESET
+.br
+REWRITE
+.IP
+Four data items are passed on the stack;
+the size of the data type associated with the file,
+the maximum size of the name (possibly zero),
+a pointer to the file name (possibly null),
+and a pointer to the file variable.
+If the file has never existed it is created as in
+.SM DEFNAME .
+If no file name is specified and no previous name exists
+(for example one created by
+.SM DEFNAME
+) then a system temporary name is created.
+.SM RESET
+then opens the file for input, while
+.SM REWRITE
+opens the file for output.
+.sp 1
+.PP
+The three remaining file operations are
+.SM FLUSH
+that flushes the active file,
+.SM REMOVE
+that takes the pointer to a file name and removes the
+specified file, and
+.SM MESSAGE
+that flushes all the output files and sets the
+standard error file to be the active file.
diff --git a/share/doc/papers/px/pxin4.n b/share/doc/papers/px/pxin4.n
new file mode 100644
index 000000000000..a4ee4b4a2307
--- /dev/null
+++ b/share/doc/papers/px/pxin4.n
@@ -0,0 +1,67 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)pxin4.n 5.2 (Berkeley) 4/17/91
+.\"
+.if !\n(xx .so tmac.p
+.nr H1 3
+.if n .ND
+.NH
+Conclusions
+.PP
+It is appropriate to consider,
+given the amount of time invested in rewriting the interpreter,
+whether the time was well spent, or whether a code-generator
+could have been written with an equivalent amount of effort.
+The Berkeley Pascal system is being modified to interface
+to the code generator of the portable C compiler with
+not much more work than was involved in rewriting
+.I px .
+However this compiler will probably not supersede the interpreter
+in an instructional environment as the
+necessary loading and assembly processes will slow the
+compilation process to a noticeable degree.
+This effect will be further exaggerated because
+student users spend more time in compilation than in execution.
+Measurements over the course of a quarter at Berkeley,
+with a mixture of students ranging from beginning programming to
+upper-division compiler construction,
+show that the time spent in compilation exceeds the time
+spent in the interpreter, the ratio being approximately 60/40.
+.PP
+A more promising approach might have been a throw-away code generator
+such as the one built for the
+.SM
+WATFIV
+.NL
+system.
+However, high-quality post-mortem and interactive
+debugging facilities would have been much more difficult to provide
+in such a system than in the interpreter environment.
diff --git a/share/doc/papers/px/table2.1.n b/share/doc/papers/px/table2.1.n
new file mode 100644
index 000000000000..9f142d98cc8b
--- /dev/null
+++ b/share/doc/papers/px/table2.1.n
@@ -0,0 +1,83 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)table2.1.n 5.2 (Berkeley) 4/17/91
+.\"
+.DS L
+.TS
+box center;
+c s s
+c s s
+c c c
+n ap-2 a.
+Table 2.1 \- Operator Suffixes
+=
+.sp
+Unary operator suffixes
+.sp .1i
+Suffix Example Argument type
+2 NEG2 Short integer (2 bytes)
+4 SQR4 Long integer (4 bytes)
+8 ABS8 Real (8 bytes)
+.sp
+_
+.sp
+.T&
+c s s
+c c c
+n ap-2 a.
+Binary operator suffixes
+.sp .1i
+Suffix Example Argument type
+2 ADD2 Two short integers
+24 MUL24 Short above long integer
+42 REL42 Long above short integer
+4 DIV4 Two long integers
+28 DVD28 Short integer above real
+48 REL48 Long integer above real
+82 SUB82 Real above short integer
+84 MUL84 Real above long integer
+8 ADD8 Two reals
+.sp
+_
+.sp
+.T&
+c s s
+c c c
+n ap-2 a.
+Other Suffixes
+.sp .1i
+Suffix Example Argument types
+T ADDT Sets
+G RELG Strings
+.sp
+.TE
+.DE
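+.PP
+As a minimal sketch of how this suffix encoding might drive the
+interpreter's dispatch, the C fragment below selects an addition by
+operand width; the stack representation and names are invented for
+illustration, and reals (suffix 8) would be handled analogously with
+a stack of doubles.
+.LS
+#include <stdint.h>
+
+static int32_t stack[256];      /* hypothetical operand stack */
+static int     sp;
+
+static int32_t pop(void)       { return stack[--sp]; }
+static void    push(int32_t v) { stack[sp++] = v; }
+
+/* Dispatch ADD by the width suffix of table 2.1. */
+void
+do_add(int suffix)
+{
+        int32_t b = pop(), a = pop();
+
+        if (suffix == 2)        /* ADD2: two short integers */
+                push((int16_t)(a + b));
+        else                    /* ADD4: two long integers */
+                push(a + b);
+}
+.LE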
diff --git a/share/doc/papers/px/table2.2.n b/share/doc/papers/px/table2.2.n
new file mode 100644
index 000000000000..9a3f1db85b84
--- /dev/null
+++ b/share/doc/papers/px/table2.2.n
@@ -0,0 +1,85 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)table2.2.n 5.2 (Berkeley) 4/17/91
+.\"
+.DS L
+.TS
+box center;
+c s
+c | c
+ci | aw(3.25i).
+Table 2.2 \- Inline data type codes
+_
+Code Description
+=
+a T{
+.fi
+An address offset is given in the word
+following the instruction.
+T}
+_
+A T{
+An address offset is given in the four bytes following the instruction.
+T}
+_
+l T{
+An index into the display
+is given in the sub-opcode.
+T}
+_
+r T{
+A relational operator is encoded in the sub-opcode. (see section 2.3)
+T}
+_
+s T{
+A small integer is
+placed in the sub-opcode, or in the next word
+if it is zero or too large.
+T}
+_
+v T{
+Variable length inline data.
+T}
+_
+w T{
+A word value in the following word.
+T}
+_
+W T{
+A long value in the following four bytes.
+T}
+_
+" T{
+An inline constant string.
+T}
+.TE
+.DE
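+.PP
+As one illustration of these encodings, the C fragment below fetches
+an ``s'' datum; the helper name and the little-endian byte order
+(as on the VAX) are assumptions, not a transcription of the
+interpreter source.
+.LS
+#include <stdint.h>
+
+/* Fetch an `s' datum: the small integer is in the sub-opcode
+ * byte; a sub-opcode of zero signals that the value (zero itself,
+ * or one too large for a byte) follows in the next word. */
+int
+fetch_s(const uint8_t **pc)
+{
+        int v = *(*pc)++;       /* consume the sub-opcode byte */
+
+        if (v == 0) {           /* escape to the following word */
+                v = (int16_t)((*pc)[0] | ((*pc)[1] << 8));
+                *pc += 2;
+        }
+        return v;
+}
+.LE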
diff --git a/share/doc/papers/px/table2.3.n b/share/doc/papers/px/table2.3.n
new file mode 100644
index 000000000000..51796aef9a04
--- /dev/null
+++ b/share/doc/papers/px/table2.3.n
@@ -0,0 +1,45 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)table2.3.n 5.2 (Berkeley) 4/17/91
+.\"
+.TS H
+box center;
+c s s
+lw(14) | lw(12) | lw(40)
+lp-2 | a | l.
+Table 2.3 \- Machine operations
+_
+Mnemonic Reference Description
+=
+.TH
+.so fig2.3.n
+.TE
diff --git a/share/doc/papers/px/table3.1.n b/share/doc/papers/px/table3.1.n
new file mode 100644
index 000000000000..26db82ec0a1e
--- /dev/null
+++ b/share/doc/papers/px/table3.1.n
@@ -0,0 +1,47 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)table3.1.n 5.2 (Berkeley) 4/17/91
+.\"
+.TS
+center;
+n l l.
+\-108 FNAME Text name of associated UNIX file
+\-30 LCOUNT Current count of lines output
+\-26 LLIMIT Maximum number of lines permitted
+\-22 FBUF UNIX FILE pointer
+\-18 FCHAIN Chain to next file
+\-14 FLEV Pointer to associated file variable
+\-10 PFNAME Pointer to name of file for error messages
+\-6 FUNIT File status flags
+\-4 FSIZE Size of elements in the file
+0 File window element
+.TE
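+.PP
+Rendered as a C structure, the layout above might look as follows;
+this is a speculative reconstruction assuming 4-byte longs and
+pointers (as on the VAX), chosen so that the file window element
+falls at offset zero, and is not the declaration used in the
+interpreter source.
+.LS
+#include <stdio.h>
+
+struct filerec {
+        char    fname[78];      /* -108: name of associated UNIX file */
+        long    lcount;         /*  -30: current count of lines output */
+        long    llimit;         /*  -26: maximum lines permitted */
+        FILE    *fbuf;          /*  -22: UNIX FILE pointer */
+        struct  filerec *fchain;/*  -18: chain to next file */
+        char    *flev;          /*  -14: associated file variable */
+        char    *pfname;        /*  -10: file name for error messages */
+        short   funit;          /*   -6: file status flags */
+        long    fsize;          /*   -4: size of elements in the file */
+        char    window[1];      /*    0: file window element */
+};
+.LE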
diff --git a/share/doc/papers/px/tmac.p b/share/doc/papers/px/tmac.p
new file mode 100644
index 000000000000..b6f381ab0ba9
--- /dev/null
+++ b/share/doc/papers/px/tmac.p
@@ -0,0 +1,113 @@
+.\" Copyright (c) 1979 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)tmac.p 5.2 (Berkeley) 4/17/91
+.\"
+'if \n(FM=0 'so /usr/lib/tmac/tmac.s
+.if n .nr FM 1.2i
+.if t .tr *\(**=\(eq/\(sl+\(pl
+.bd S B 3
+.de mD
+.ta 8n 17n 42n
+..
+.de SM \"SM - set or print argument two points smaller
+.if "\\$1"" .ps -2
+.if !"\\$1"" \s-2\\$1\s0\\$2
+..
+.de LG \"LG - set or print argument two points larger
+.if "\\$1"" .ps +2
+.if !"\\$1"" \s+2\\$1\s0\\$2
+..
+.de HP \"HP - .IP paragraph with no preceding vertical space
+.nr pd \\n(PD
+.nr PD 0
+.if \\n(.$=0 .IP
+.if \\n(.$=1 .IP "\\$1"
+.if \\n(.$>=2 .IP "\\$1" "\\$2"
+.nr PD \\n(pd
+..
+.de ZP \"ZP - .PP paragraph with no preceding vertical space
+.nr pd \\n(PD
+.nr PD 0
+.PP
+.nr PD \\n(pd
+..
+.de LS \"LS - Literal display; ASCII DS
+.if \\n(.$=0 .DS
+.if \\n(.$=1 \\$1
+.if \\n(.$>1 \\$1 "\\$2"
+.if t .tr '\'`\`^\(ua-\(mi
+.if t .tr _\(ul
+..
+.de LE \"LE - End literal display
+.DE
+.tr ''``__--^^
+..
+.de UP
+Berkeley Pascal\\$1
+..
+.de PD
+\s-2PDP\s0
+.if \\n(.$=0 11/70
+.if \\n(.$>0 11/\\$1
+..
+.de DK
+Digital Equipment Corporation\\$1
+..
+.de PI
+.I pi \\$1
+..
+.de Xp
+.I Pxp \\$1
+..
+.de XP
+.I pxp \\$1
+..
+.de IX
+.I pix \\$1
+..
+.de X
+.I px \\$1
+..
+.de PX
+.I px \\$1
+..
+.if n .ds dg +
+.if t .ds dg \(dg
+.if n .ds Dg \*(dg
+.if t .ds Dg \v'-0.3m'\s-2\*(dg\s0\v'0.3m'
+.if n .ds dd *
+.if t .ds dd \(dd
+.if n .ds Dd \*(dd
+.if t .ds Dd \v'-0.3m'\s-2\*(dd\s0\v'0.3m'
+.if n .ds b \\fI
+.if t .ds b \\fB
+.nr xx 1
diff --git a/share/doc/papers/relengr/0.t b/share/doc/papers/relengr/0.t
new file mode 100644
index 000000000000..7fb3290b2eca
--- /dev/null
+++ b/share/doc/papers/relengr/0.t
@@ -0,0 +1,91 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)0.t 5.1 (Berkeley) 4/17/91
+.\"
+.rm CM
+.nr PO 1.25i
+.ds CH "
+.ds CF "%
+.nr Fn 0 1
+.ds b3 4.3\s-1BSD\s+1
+.de KI
+.ds Lb "Fig. \\n+(Fn
+.KF
+.ce 1
+Figure \\n(Fn - \\$1.
+..
+.de SM
+\\s-1\\$1\\s+1\\$2
+..
+.de NM
+\&\fI\\$1\fP\\$2
+..
+.de RN
+\&\fI\\$1\fP\^(\^)\\$2
+..
+.de PN
+\&\fB\\$1\fP\\$2
+..
+.TL
+The Release Engineering of 4.3\s-1BSD\s0
+.AU
+Marshall Kirk McKusick
+.AU
+Michael J. Karels
+.AU
+Keith Bostic
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, California 94720
+.AB
+This paper describes an approach used by a small group of people
+to develop and integrate a large software system.
+It details the development and release engineering strategy
+used during the preparation of the \*(b3 version of the UNIX\(dg
+.FS
+\(dgUNIX is a registered trademark of AT&T in the US and other countries.
+.FE
+operating system.
+Each release cycle is divided into an initial development phase
+followed by a release engineering phase.
+The release engineering of the distribution is done in three steps.
+The first step has an informal control policy for tracking modifications;
+it results in an alpha distribution.
+The second step has more rigid change mechanisms in place;
+it results in a beta release.
+During the final step changes are tracked very closely;
+the result is the final distribution.
+.AE
+.LP
diff --git a/share/doc/papers/relengr/1.t b/share/doc/papers/relengr/1.t
new file mode 100644
index 000000000000..6fbe287825d5
--- /dev/null
+++ b/share/doc/papers/relengr/1.t
@@ -0,0 +1,69 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)1.t 5.1 (Berkeley) 4/17/91
+.\"
+.NH
+Introduction
+.PP
+The Computer Systems Research Group (\c
+.SM CSRG )
+has always been a small group of software developers.
+This resource limitation requires careful software-engineering management
+as well as careful coordination of both
+.SM CSRG
+personnel and the members of the general community who
+contribute to the development of the system.
+.PP
+Releases from Berkeley alternate between those that introduce
+major new facilities and those that provide bug fixes and efficiency
+improvements.
+This alternation allows timely releases, while providing for refinement,
+tuning, and correction of the new facilities.
+The timely followup of ``cleanup'' releases reflects the importance
+.SM CSRG
+places on providing a reliable and robust system on which its
+user community can depend.
+.PP
+The development of the Berkeley Software Distribution (\c
+.SM BSD )
+illustrates an \fIadvantage\fP of having a few
+principal developers:
+the developers all understand the entire system thoroughly enough
+to be able to coordinate their own work with
+that of other people to produce a coherent final system.
+Companies with large development organizations find
+this result difficult to duplicate.
+This paper describes the process by which
+the development effort for \*(b3 was managed.
+.[
+design and implementation
+.]
diff --git a/share/doc/papers/relengr/2.t b/share/doc/papers/relengr/2.t
new file mode 100644
index 000000000000..0c3ce8c80bcc
--- /dev/null
+++ b/share/doc/papers/relengr/2.t
@@ -0,0 +1,146 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)2.t 5.1 (Berkeley) 4/17/91
+.\"
+.NH
+System Development
+.PP
+The first phase of each Berkeley system is its development.
+.SM CSRG
+maintains a continuously evolving list of projects that are candidates
+for integration into the system.
+Some of these are prompted by emerging ideas from the research world,
+such as the availability of a new technology, while other additions
+are suggested by the commercial world, such as the introduction of
+new standards like
+.SM POSIX ,
+and still other projects are emergency responses to situations like
+the Internet Worm.
+.PP
+These projects are ordered by weighing the perceived benefit of each
+project against its difficulty;
+the most important are selected for inclusion in each new release.
+Often there is a prototype available from a group outside
+.SM CSRG .
+Because of the limited staff at
+.SM CSRG ,
+this prototype is obtained for use as a starting base
+for integration into the
+.SM BSD
+system.
+Only if no prototype is available is the project begun in-house.
+In either case, the design of the facility is forced to conform to the
+.SM CSRG
+style.
+.PP
+Unlike other development groups, the staff of
+.SM CSRG
+specializes by projects rather than by particular parts
+of the system;
+a staff person will be responsible for all aspects of a project.
+This responsibility starts at the associated kernel device drivers;
+it proceeds up through the rest of the kernel,
+through the C library and system utility programs,
+ending at the user application layer.
+This staff person is also responsible for related documentation,
+including manual pages.
+Many projects proceed in parallel,
+interacting with other projects as their paths cross.
+.PP
+All source code, documentation, and auxiliary files are kept
+under a source code control system.
+During development,
+this control system is critical for notifying people
+when they are colliding with other ongoing projects.
+Even more important, however,
+is the audit trail maintained by the control system that
+is critical to the release engineering phase of the project
+described in the next section.
+.PP
+Much of the development of
+.SM BSD
+is done by people located at other institutions.
+Many of these people not only have interim copies of the release
+running on their own machines,
+but also have user accounts on the main development
+machine at Berkeley.
+Such users are commonly found logged in at Berkeley over the
+Internet, or sometimes via telephone dialup, from places as far away
+as Massachusetts or Maryland, as well as from closer places, such as
+Stanford.
+For the \*(b3 release,
+certain users had permission to modify the master copy of the
+system source directly.
+People given access to the master sources
+are carefully screened beforehand,
+but are not closely supervised.
+Their work is checked at the end of the beta-test period by
+.SM CSRG
+personnel who back out inappropriate changes.
+Several facilities, including the
+Fortran and C compilers,
+as well as important system programs, for example,
+.PN telnet
+and
+.PN ftp ,
+include significant contributions from people who did not work
+directly for
+.SM CSRG .
+One important exception to this approach is that changes to the kernel
+are made only by
+.SM CSRG
+personnel, although the changes are often suggested by the larger community.
+.PP
+The development phase continues until
+.SM CSRG
+decides that it is appropriate to make a release.
+The decision to halt development and transition to release mode
+is driven by several factors.
+The most important is that enough projects have been completed
+to make the system significantly superior to the previously released
+version of the system.
+For example,
+\*(b3 was released primarily because of the need for
+the improved networking capabilities and the markedly
+improved system performance.
+Of secondary importance is the issue of timing.
+If the releases are too infrequent, then
+.SM CSRG
+will be inundated with requests for interim releases.
+Conversely,
+if systems are released too frequently,
+the integration cost for many vendors will be too high,
+causing them to ignore the releases.
+Finally,
+the process of release engineering is long and tedious.
+Frequent releases slow the rate of development and
+cause undue tedium to the staff.
diff --git a/share/doc/papers/relengr/3.t b/share/doc/papers/relengr/3.t
new file mode 100644
index 000000000000..8d89ded0c0ce
--- /dev/null
+++ b/share/doc/papers/relengr/3.t
@@ -0,0 +1,390 @@
+.\" Copyright (c) 1989 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)3.t 5.1 (Berkeley) 4/17/91
+.\"
+.NH
+System Release
+.PP
+Once the decision has been made to halt development
+and begin release engineering,
+all currently unfinished projects are evaluated.
+This evaluation weighs the time required to complete
+the project against how important the project is to the
+upcoming release.
+Projects that are not selected for completion are
+removed from the distribution branch of the source code control system
+and saved on branch deltas so they can be retrieved,
+completed, and merged into a future release;
+the remaining unfinished projects are brought to orderly completion.
+.PP
+Developments from
+.SM CSRG
+are released in three steps: alpha, beta, and final.
+Alpha and beta releases are not true distributions\(emthey
+are test systems.
+Alpha releases are normally available to only a few sites,
+usually those working closely with
+.SM CSRG .
+More sites are given beta releases,
+as the system is closer to completion,
+and needs wider testing to find more obscure problems.
+For example, \*(b3 alpha was distributed to about fifteen
+sites, while \*(b3 beta ran at more than a hundred.
+.NH 2
+Alpha Distribution Development
+.PP
+The first step in creating an alpha distribution is to evaluate the
+existing state of the system and to decide what software should be
+included in the release.
+This decision process includes not only deciding what software should
+be added, but also what obsolete software ought to be retired from the
+distribution.
+The new software includes the successful projects that have been
+completed at
+.SM CSRG
+and elsewhere, as well as some portion of the vast quantity of
+contributed software that has been offered during the development
+period.
+.PP
+Once an initial list has been created,
+a prototype filesystem corresponding to the distribution
+is constructed, typically named
+.PN /nbsd .
+This prototype will eventually turn into the master source tree for the
+final distribution.
+During the period that the alpha distribution is being created,
+.PN /nbsd
+is mounted read-write, and is highly fluid.
+Programs are created and deleted,
+old versions of programs are completely replaced,
+and the correspondence between the sources and binaries
+is only loosely tracked.
+People outside
+.SM CSRG
+who are helping with the distribution are free to
+change their parts of the distribution at will.
+.PP
+During this period the newly forming distribution is
+checked for interoperability.
+For example,
+in \*(b3 the output of context differences from
+.PN diff
+was changed to merge overlapping sections.
+Unfortunately, this change broke the
+.PN patch
+program which could no longer interpret the output of
+.PN diff .
+Since the change to
+.PN diff
+and the
+.PN patch
+program had originated outside Berkeley,
+.SM CSRG
+had to coordinate the efforts of the respective authors
+to make the programs work together harmoniously.
+.PP
+Once the sources have stabilized,
+an attempt is made to compile the entire source tree.
+Often this exposes errors caused by changed header files,
+or use of obsoleted C library interfaces.
+If the incompatibilities affect too many programs,
+or require excessive amounts of change in the programs
+that are affected,
+the incompatibility is backed out or some backward-compatible
+interface is provided.
+The incompatibilities that are found and left in are noted
+in a list that is later incorporated into the release notes.
+Thus, users upgrading to the new system can anticipate problems
+in their own software that will require change.
+.PP
+Once the source tree compiles completely,
+it is installed and becomes the running system that
+.SM CSRG
+uses on its main development machine.
+Once in day-to-day use,
+other interoperability problems become apparent
+and are resolved.
+When all known problems have been resolved, and the system has been
+stable for some period of time, an alpha distribution tape is made
+from the contents of
+.PN /nbsd .
+.PP
+The alpha distribution is sent out to a small set of test sites.
+These test sites are selected as having a
+sophisticated user population, not only capable of finding bugs,
+but also of determining their cause and developing a fix for the problem.
+These sites are usually composed of groups that are contributing
+software to the distribution or groups that have a particular expertise
+with some portion of the system.
+.NH 2
+Beta Distribution Development
+.PP
+After the alpha tape is created,
+the distribution filesystem is mounted read-only.
+Further changes are requested in a change log rather than
+being made directly to the distribution.
+The change requests are inspected and implemented by a
+.SM CSRG
+staff person, followed by a compilation of the affected
+programs to ensure that they still build correctly.
+Once the alpha tape has been cut,
+changes to the distribution are no longer made by people outside
+.SM CSRG .
+.PP
+As the alpha sites install and begin running the alpha distribution,
+they monitor the problems that they encounter.
+For minor bugs, they typically report back the bug along with
+a suggested fix.
+Since many of the alpha sites are selected from among the people
+working closely with
+.SM CSRG ,
+they often have accounts on, and access to, the primary
+.SM CSRG
+development machine.
+Thus, they are able to directly install the fix themselves,
+and simply notify
+.SM CSRG
+when they have fixed the problem.
+After verifying the fix, the affected files are added to
+the list to be updated on
+.PN /nbsd .
+.PP
+The more important task of the alpha sites is to test out the
+new facilities that have been added to the system.
+The alpha sites often find major design flaws
+or operational shortcomings of the facilities.
+When such problems are found,
+the person in charge of that facility is responsible
+for resolving the problem.
+Occasionally this requires redesigning and reimplementing
+parts of the affected facility.
+For example,
+in 4.2\s-1BSD\s+1,
+the alpha release of the networking system did not have connection queueing.
+This shortcoming prevented the network from handling many
+connections to a single server.
+The result was that the networking interface had to be
+redesigned to provide this functionality.
+.PP
+The alpha sites are also responsible for ferreting out interoperability
+problems between different utilities.
+The user populations of the test sites differ from the user population at
+.SM CSRG ,
+and, as a result, the utilities are exercised in ways that differ
+from the ways that they are used at
+.SM CSRG .
+These differences in usage patterns turn up problems that
+do not occur in our initial test environment.
+.PP
+The alpha sites frequently redistribute the alpha tape to several
+of their own alpha sites that are particularly interested
+in parts of the new system.
+These additional sites are responsible for reporting
+problems back to the site from which they received the distribution,
+not to
+.SM CSRG .
+Often these redistribution sites are less sophisticated than the
+direct alpha sites, so their reports must be filtered
+to weed out spurious or site-dependent bug reports.
+The direct alpha sites sift through the reports to find those that
+are relevant, and usually verify the suggested fix if one is given,
+or develop a fix if none is provided.
+This hierarchical testing process forces
+bug reports, fixes, and new software
+to be collected, evaluated, and checked for inaccuracies
+by first-level sites before being forwarded to
+.SM CSRG ,
+allowing the developers at
+.SM CSRG
+to concentrate on tracking the changes being made to the system
+rather than sifting through information (often voluminous) from every
+alpha-test site.
+.PP
+Once the major problems have been attended to,
+the focus turns to getting the documentation synchronized
+with the code that is being shipped.
+The manual pages need to be checked to be sure that
+they accurately reflect any changes to the programs that
+they describe.
+Usually the manual pages are kept up to date as
+the program they describe evolves.
+However, the supporting documents frequently do not get changed,
+and must be edited to bring them up to date.
+During this review, the need for other documents becomes evident.
+For example, it was
+during this phase of \*(b3 that it was decided
+to add a tutorial document on how to use the socket
+interprocess communication primitives.
+.PP
+Another task during this period is to contact the people that
+have contributed complete software packages
+(such as
+.PN RCS
+or
+.PN MH )
+in previous releases to see if they wish to
+make any revisions to their software.
+For those who do,
+the new software has to be obtained,
+and tested to verify that it compiles and runs
+correctly on the system to be released.
+Again, this integration and testing can often be done by the
+contributors themselves by logging directly into the master machine.
+.PP
+After the stream of bug reports has slowed down
+to a reasonable level,
+.SM CSRG
+begins a careful review of all the changes to the
+system since the previous release.
+The review is done by running a recursive
+.PN diff
+of the entire source tree\(emhere, of
+.PN /nbsd
+with 4.2\s-1BSD\s+1.
+All the changes are checked to ensure that they are reasonable,
+and have been properly documented.
+The process often turns up questionable changes.
+When such a questionable change is found,
+the source code control system log is examined to find
+out who made the change and what their explanation was
+for the change.
+If the log does not resolve the problem,
+the person responsible for the change is asked for an explanation
+of what they were trying to accomplish.
+If the reason is not compelling,
+the change is backed out.
+Facilities deemed inappropriate in \*(b3 included new options to
+the directory-listing command and a changed return value for the
+.RN fseek
+library routine;
+the changes were removed from the source before final distribution.
+Although this process is long and tedious,
+it forces the developers to obtain a coherent picture of the entire set of
+changes to the system.
+This exercise often turns up inconsistencies that would
+otherwise never be found.
+.PP
+The comparison results in
+a pair of documents detailing
+changes to every user-level command
+.[
+Bug Fixes and Changes
+.]
+and to every kernel source file.
+.[
+Changes to the Kernel
+.]
+These documents are delivered with the final distribution.
+A user can look up any command by name and see immediately
+what has changed,
+and a developer can similarly look up any kernel
+file by name and get a summary of the changes to that file.
+.PP
+Having completed the review of the entire system,
+the preparation of the beta distribution is started.
+Unlike the alpha distribution, where pieces of the system
+may be unfinished and the documentation incomplete,
+the beta distribution is put together as if it were
+going to be the final distribution.
+All known problems are fixed, and any remaining development
+is completed.
+Once the beta tape has been prepared,
+no further changes are permitted to
+.PN /nbsd
+without careful review,
+as spurious changes made after the system has been
+.PN diff ed
+are unlikely to be caught.
+.NH 2
+Final Distribution Development
+.PP
+The beta distribution goes to more sites than the
+alpha distribution for three main reasons.
+First, as it is closer to the final release, more sites are willing
+to run it in a production environment without fear of catastrophic failures.
+Second, more commercial sites delivering
+.SM BSD -\c
+derived systems are interested in getting a preview of the
+upcoming changes in preparation for merging them into their
+own systems.
+Finally, because the beta tape has fewer problems,
+it is beneficial to offer it to more sites in hopes of
+finding as many of the remaining problems as possible.
+Also, by handing the system out to less sophisticated sites,
+issues that would be ignored by the users of the alpha sites
+become apparent.
+.PP
+The anticipation is that the beta tape will not require
+extensive changes to either the programs or the documentation.
+Most of the work involves sifting through the reported bugs
+to find those that are relevant and devising the minimal
+reasonable set of changes to fix them.
+After the fix has been thoroughly tested, it is listed in the update log for
+.PN /nbsd .
+One person at
+.SM CSRG
+is responsible for doing the update of
+.PN /nbsd
+and ensuring that everything affected by the change is rebuilt and tested.
+Thus, a change to a C library routine requires that the entire
+system be rebuilt.
+.PP
+During this period, the documentation is all printed and proofread.
+As minor changes are made to the manual pages and documentation,
+the affected pages must be reprinted.
+.PP
+The final step in the release process is to check the distribution tree
+to ensure that it is in a consistent state.
+This step includes verification that every file and directory
+on the distribution has the proper owner, group, and modes.
+All source files must be checked to be sure that they have
+appropriate copyright notices and source code control system headers.
+Any extraneous files must be removed.
+Finally, the installed binaries must be checked to ensure that they correspond
+exactly to the sources and libraries that are on the distribution.
+.PP
+This checking is a formidable task given that there are over 20,000 files on
+a typical distribution.
+Much of the checking can be done by a set of programs that scan
+the distribution tree.
+Unfortunately, the exception list is long, and requires
+hours of tedious hand checking; this has caused
+.SM CSRG
+to develop even
+more comprehensive validation programs for use in our next release.
+.PP
+Once the final set of checks has been run,
+the master tape can be made, and the official distribution started.
+As for the staff of
+.SM CSRG ,
+we usually take a brief vacation before plunging back into
+a new development phase.
diff --git a/share/doc/papers/relengr/Makefile b/share/doc/papers/relengr/Makefile
new file mode 100644
index 000000000000..506fa7a81fc7
--- /dev/null
+++ b/share/doc/papers/relengr/Makefile
@@ -0,0 +1,12 @@
+# @(#)Makefile 1.6 (Berkeley) 6/8/93
+
+DIR= papers/relengr
+SRCS= 0.t 1.t 2.t 3.t
+MACROS= -ms
+EXTRA= ref.bib tmac.srefs
+REFER= /a/staff/mckusick/book/ref/refer -m -n -e -l -s -p ref.bib
+
+paper.ps: ${SRCS}
+ ${REFER} ${SRCS} | ${ROFF} > ${.TARGET}
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/relengr/ref.bib b/share/doc/papers/relengr/ref.bib
new file mode 100644
index 000000000000..6f33cd7e9dd4
--- /dev/null
+++ b/share/doc/papers/relengr/ref.bib
@@ -0,0 +1,26 @@
+%A M. K. McKusick
+%A J. M. Bloom
+%A M. J. Karels
+%T Bug Fixes and Changes in 4.3BSD
+%B \s-1UNIX\s0 System Manager's Manual, 4.3 Berkeley Software Distribution, Virtual VAX-11 Version
+%I \s-1USENIX\s0 Association
+%C Berkeley, CA
+%P 12:1\-22
+%D 1986
+
+%A M. J. Karels
+%T Changes to the Kernel in 4.3BSD
+%B \s-1UNIX\s0 System Manager's Manual, 4.3 Berkeley Software Distribution, Virtual VAX-11 Version
+%I \s-1USENIX\s0 Association
+%C Berkeley, CA
+%P 13:1\-32
+%D 1986
+
+%A S. J. Leffler
+%A M. K. McKusick
+%A M. J. Karels
+%A J. S. Quarterman
+%T The Design and Implementation of the 4.3BSD UNIX Operating System
+%I Addison-Wesley
+%C Reading, MA
+%D 1989
diff --git a/share/doc/papers/relengr/ref.bib.ig b/share/doc/papers/relengr/ref.bib.ig
new file mode 100644
index 000000000000..fb24c6ea0c9f
--- /dev/null
+++ b/share/doc/papers/relengr/ref.bib.ig
@@ -0,0 +1,3 @@
+ref.bib:0,249 mckusi bloom karels bug fixes change system manage manual berkel softwa distri virtua vax versio associ berkel 1986
+ref.bib:249,216 karels change kernel system manage manual berkel softwa distri virtua vax versio associ berkel 1986
+ref.bib:465,181 leffle mckusi karels quarte design implem unix operat system addiso wesley readin 1989
diff --git a/share/doc/papers/relengr/spell.ok b/share/doc/papers/relengr/spell.ok
new file mode 100644
index 000000000000..13f5cf8b90ba
--- /dev/null
+++ b/share/doc/papers/relengr/spell.ok
@@ -0,0 +1,15 @@
+BSD
+Bostic
+CH
+CM
+CSRG
+Fn
+Karels
+Lb
+McKusick
+POSIX
+editted
+filesystem
+followup
+mothballed
+nbsd
diff --git a/share/doc/papers/relengr/tmac.srefs b/share/doc/papers/relengr/tmac.srefs
new file mode 100644
index 000000000000..889e3fe6bf5d
--- /dev/null
+++ b/share/doc/papers/relengr/tmac.srefs
@@ -0,0 +1,179 @@
+.\" @(#)tmac.srefs 1.14 11/2/88
+.\" REFER macros .... citations
+.de []
+.][ \\$1
+..
+.de ][
+.if \\$1>5 .tm Bad arg to []
+.[\\$1
+..
+.if n .ds [. [
+.\".if t .ds [. \s-2\v'-.4m'\f1
+.if t .ds [. [
+.if n .ds .] ]
+.\".if t .ds .] \v'.4m'\s+2\fP
+.if t .ds .] ]
+.ds (. \& [
+.ds .) ]
+.if n .ds [o ""
+.if n .ds [c ""
+.if t .ds [o ``
+.if t .ds [c ''
+.ds [e \\fIet al.\\fP
+.\" for author list in reference:
+.ds &1 &
+.\" for -m signal (auth1 and auth2, year):
+.ds &2 &
+.\" the next lines deal with the problem of .[1] or [1].
+.\" refer will write "linexxx\*(<.[1]\*(>.
+.\" and either "<." or ">." should produce the .;
+.\" similarly for , and ;
+.rm <. <, <;
+.if n .ds >. .
+.if t .ds >. .
+.if n .ds >, ,
+.if t .ds >, ,
+.if n .ds >; ;
+.if t .ds >; ;
+.de [5 \" tm style
+.FS
+.IP "\\*([F.\0"
+\\*([A, \\f2\\*([T\\f1,
+.ie \\n(TN \\*([M.
+.el Bell Laboratories internal memorandum (\\*([D).
+.RT
+.FE
+..
+.de [0 \" other
+.FS
+.nr [: 0
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \{.nr [: 1
+\\*([A\c\}
+.if !"\\*([T"" \{.if \\n([:>0 ,
+.nr [: 1
+\\f2\\*([T\\f1\c\}
+.if !"\\*([O""\{.if \\n([:>0 ,
+.nr [: 1
+.if \\n([O>0 .nr [: 0
+\\*([O\c
+.if \\n([O>0 \& \c\}
+.ie !"\\*([D"" \{.if \\n([:>0 ,
+.nr [: 1
+\\*([D\c\}
+.if \\n([:>0 \&.
+.RT
+.FE
+..
+.de [1 \" journal article
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \\*([A,
+.if !"\\*([T"" \\*([o\\*([T,\\*([c
+\\f2\\*([J\\f1\c
+.if !"\\*([V"" .if n \& Vol.\&\c
+.if !"\\*([V"" \& \\f3\\*([V\\f1\c
+.if !"\\*([N"" (\\*([N)\c
+.if !"\\*([P"" \{\
+.ie \\n([P>0 , pp. \c
+.el , p. \c
+\\*([P\c\}
+.if !"\\*([I"" .if "\\*([R"" , \\*([I\c
+.if !"\\*([O"" .if \\n([O=0 , \\*([O\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([O"" .if \\n([O>0 \\*([O
+.RT
+.FE
+..
+.de [2 \" book
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \\*([A,
+.if !"\\*([T"" \\f2\\*([T,\\f1
+\\*([I\c
+.if !"\\*([C"" , \\*([C\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([G"" Gov't. ordering no. \\*([G.
+.if !"\\*([O"" \\*([O
+.RT
+.FE
+..
+.de [4 \" report
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+\\*([A, \\*([o\\*([T,\\*([c
+\\*([R\c
+.if !"\\*([G"" \& (\\*([G)\c
+.if !"\\*([I"" , \\*([I\c
+.if !"\\*([C"" , \\*([C\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([O"" \\*([O
+.RT
+.FE
+..
+.de [3 \" article in book
+.FS
+.if !"\\*([F"" .IP "\\*([F.\0"
+.if !"\\*([A"" \\*([A,
+.if !"\\*([T"" \\*([o\\*([T,\\*([c
+.if !"\\*([P"" pp. \\*([P
+in \\f2\\*([B\\f1\c
+.if !"\\*([E"" , ed. \\*([E\c
+.if !"\\*([I"" , \\*([I\c
+.if !"\\*([C"" , \\*([C\c
+.if !"\\*([D"" \& (\\*([D)\c
+\&.
+.if !"\\*([O"" \\*([O
+.RT
+.FE
+..
+.de ]<
+.[<
+..
+.de [<
+.RT
+.ne 62p
+.ie \\n(rS \{\
+. rs
+. sp 4p
+.\}
+.el .sp 27p
+.po -2.5P
+.Li 2 30.5P
+\\s11\fBReferences\fP\s10
+.br
+.if \\n(Ns<2 \{\
+. nr Ns 1
+. ds ST References
+.\}
+.\"nr Tt 7
+.po
+.sp 8p
+.rm FS FE
+.\"sy echo '.T3 "\\\\t\\\\tReferences" \\n%' >>Toc
+.ns
+..
+.de [>
+.]>
+..
+.de ]>
+.sp
+..
+.de ]-
+.[-
+..
+.de [-
+.rm [V [P [A [T
+.rm [N [C [B [O
+.rm [R [I [E [D
+..
+.de ]]
+this is never
+executed
+and just
+uses up an end-of-file
+bug.
+..
diff --git a/share/doc/papers/sysperf/0.t b/share/doc/papers/sysperf/0.t
new file mode 100644
index 000000000000..0c27a344cf3a
--- /dev/null
+++ b/share/doc/papers/sysperf/0.t
@@ -0,0 +1,247 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)0.t 5.1 (Berkeley) 4/17/91
+.\"
+.if n .ND
+.TL
+Measuring and Improving the Performance of Berkeley UNIX*
+.sp
+April 17, 1991
+.AU
+Marshall Kirk McKusick,
+Samuel J. Leffler\(dg,
+Michael J. Karels
+.AI
+Computer Systems Research Group
+Computer Science Division
+Department of Electrical Engineering and Computer Science
+University of California, Berkeley
+Berkeley, CA 94720
+.AB
+.FS
+* UNIX is a trademark of AT&T Bell Laboratories.
+.FE
+.FS
+\(dg Samuel J. Leffler is currently employed by:
+Silicon Graphics, Inc.
+.FE
+.FS
+This work was supported by
+the National Science Foundation under grant MCS80-05144,
+and by the Defense Advanced Research Projects Agency (DoD) under
+ARPA Order No. 4031, monitored by the Naval Electronic System Command under
+Contract No. N00039-82-C-0235.
+.FE
+The 4.2 Berkeley Software Distribution of
+.UX
+for the VAX\(dd
+.FS
+\(dd VAX, MASSBUS, UNIBUS, and DEC are trademarks of
+Digital Equipment Corporation.
+.FE
+had several problems that could severely affect the overall
+performance of the system.
+These problems were identified with
+kernel profiling and system tracing during day-to-day use.
+Once potential problem areas had been identified,
+benchmark programs were devised to highlight the bottlenecks.
+These benchmarks verified that the problems existed and provided
+a metric against which to validate proposed solutions.
+This paper examines
+the performance problems encountered and describes
+modifications that have been made
+to the system since the initial distribution.
+.PP
+The changes to the system have consisted of improvements to the
+performance of existing facilities,
+as well as functional enhancements to those facilities.
+Performance improvements in the kernel include cacheing of path name
+translations, reductions in clock handling and scheduling overhead,
+and improved throughput of the network subsystem.
+Performance improvements in the libraries and utilities include replacement of
+linear searches of system databases with indexed lookup,
+merging of most network services into a single daemon,
+and conversion of system utilities to use the more efficient
+facilities available in 4.2BSD.
+Enhancements in the kernel include the addition of subnets and gateways,
+increases in many kernel limits,
+cleanup of the signal and autoconfiguration implementations,
+and support for windows and system logging.
+Functional extensions in the libraries and utilities include
+the addition of an Internet name server,
+new system management tools,
+and extensions to \fIdbx\fP to work with Pascal.
+The paper concludes with a brief discussion of changes made to
+the system to enhance security.
+All of these enhancements are present in Berkeley UNIX 4.3BSD.
+.AE
+.LP
+.sp 2
+CR Categories and Subject Descriptors:
+D.4.3
+.B "[Operating Systems]":
+File Systems Management \-
+.I "file organization, directory structures, access methods";
+D.4.8
+.B "[Operating Systems]":
+Performance \-
+.I "measurements, operational analysis";
+.sp
+Additional Keywords and Phrases:
+Berkeley UNIX,
+system performance,
+application program interface.
+.sp
+General Terms:
+UNIX operating system,
+measurement,
+performance.
+.de PT
+.lt \\n(LLu
+.pc %
+.nr PN \\n%
+.tl '\\*(LH'\\*(CH'\\*(RH'
+.lt \\n(.lu
+..
+.af PN i
+.ds LH Performance
+.ds RH Contents
+.bp 1
+.if t .ds CF April 17, 1991
+.if t .ds LF DRAFT
+.if t .ds RF McKusick, et al.
+.ce
+.B "TABLE OF CONTENTS"
+.LP
+.sp 1
+.nf
+.B "1. Introduction"
+.LP
+.sp .5v
+.nf
+.B "2. Observation techniques
+\0.1. System maintenance tools
+\0.2. Kernel profiling
+\0.3. Kernel tracing
+\0.4. Benchmark programs
+.LP
+.sp .5v
+.nf
+.B "3. Results of our observations
+\0.1. User programs
+\0.1.1. Mail system
+\0.1.2. Network servers
+\0.2. System overhead
+\0.2.1. Micro-operation benchmarks
+\0.2.2. Path name translation
+\0.2.3. Clock processing
+\0.2.4. Terminal multiplexors
+\0.2.5. Process table management
+\0.2.6. File system buffer cache
+\0.2.7. Network subsystem
+\0.2.8. Virtual memory subsystem
+.LP
+.sp .5v
+.nf
+.B "4. Performance Improvements
+\0.1. Performance Improvements in the Kernel
+\0.1.1. Name Cacheing
+\0.1.2. Intelligent Auto Siloing
+\0.1.3. Process Table Management
+\0.1.4. Scheduling
+\0.1.5. Clock Handling
+\0.1.6. File System
+\0.1.7. Network
+\0.1.8. Exec
+\0.1.9. Context Switching
+\0.1.10. Setjmp and Longjmp
+\0.1.11. Compensating for Lack of Compiler Technology
+\0.2. Improvements to Libraries and Utilities
+\0.2.1. Hashed Databases
+\0.2.2. Buffered I/O
+\0.2.3. Mail System
+\0.2.4. Network Servers
+\0.2.5. The C Run-time Library
+\0.2.6. Csh
+.LP
+.sp .5v
+.nf
+.B "5. Functional Extensions
+\0.1. Kernel Extensions
+\0.1.1. Subnets, Broadcasts, and Gateways
+\0.1.2. Interface Addressing
+\0.1.3. User Control of Network Buffering
+\0.1.4. Number of File Descriptors
+\0.1.5. Kernel Limits
+\0.1.6. Memory Management
+\0.1.7. Signals
+\0.1.8. System Logging
+\0.1.9. Windows
+\0.1.10. Configuration of UNIBUS Devices
+\0.1.11. Disk Recovery from Errors
+\0.2. Functional Extensions to Libraries and Utilities
+\0.2.1. Name Server
+\0.2.2. System Management
+\0.2.3. Routing
+\0.2.4. Compilers
+.LP
+.sp .5v
+.nf
+.B "6. Security Tightening
+\0.1. Generic Kernel
+\0.2. Security Problems in Utilities
+.LP
+.sp .5v
+.nf
+.B "7. Conclusions
+.LP
+.sp .5v
+.nf
+.B Acknowledgements
+.LP
+.sp .5v
+.nf
+.B References
+.LP
+.sp .5v
+.nf
+.B "Appendix \- Benchmark Programs"
+.de _d
+.if t .ta .6i 2.1i 2.6i
+.\" 2.94 went to 2.6, 3.64 to 3.30
+.if n .ta .84i 2.6i 3.30i
+..
+.de _f
+.if t .ta .5i 1.25i 2.5i
+.\" 3.5i went to 3.8i
+.if n .ta .7i 1.75i 3.8i
+..
diff --git a/share/doc/papers/sysperf/1.t b/share/doc/papers/sysperf/1.t
new file mode 100644
index 000000000000..88608eef4f0a
--- /dev/null
+++ b/share/doc/papers/sysperf/1.t
@@ -0,0 +1,81 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)1.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Introduction
+.af PN 1
+.bp 1
+.NH
+Introduction
+.PP
+The Berkeley Software Distributions of
+.UX
+for the VAX have added many new capabilities that were
+previously unavailable under
+.UX .
+The development effort for 4.2BSD concentrated on providing new
+facilities, and in getting them to work correctly.
+Many new data structures were added to the system to support
+these new capabilities.
+In addition,
+many of the existing data structures and algorithms
+were put to new uses or their old functions placed under increased demand.
+The effect of these changes was that
+mechanisms that were well tuned under 4.1BSD
+no longer provided adequate performance for 4.2BSD.
+The increased user feedback that came with the release of
+4.2BSD and a growing body of experience with the system
+highlighted the performance shortcomings of 4.2BSD.
+.PP
+This paper details the work that we have done since
+the release of 4.2BSD to measure the performance of the system,
+detect the bottlenecks,
+and find solutions to remedy them.
+Most of our tuning has been in the context of the real
+timesharing systems in our environment.
+Rather than using simulated workloads,
+we have sought to analyze our tuning efforts under
+realistic conditions.
+Much of the work has been done in the machine independent parts
+of the system, hence these improvements could be applied to
+other variants of UNIX with equal success.
+All of the changes made have been included in 4.3BSD.
+.PP
+Section 2 of the paper describes the tools and techniques
+available to us for measuring system performance.
+In Section 3 we present the results of using these tools, while Section 4
+has the performance improvements
+that have been made to the system based on our measurements.
+Section 5 highlights the functional enhancements that have
+been made to Berkeley UNIX 4.2BSD.
+Section 6 discusses some of the security problems that
+have been addressed.
diff --git a/share/doc/papers/sysperf/2.t b/share/doc/papers/sysperf/2.t
new file mode 100644
index 000000000000..703cbb63cb46
--- /dev/null
+++ b/share/doc/papers/sysperf/2.t
@@ -0,0 +1,258 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)2.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Observation techniques
+.NH
+Observation techniques
+.PP
+There are many tools available for monitoring the performance
+of the system.
+Those that we found most useful are described below.
+.NH 2
+System maintenance tools
+.PP
+Several standard maintenance programs are invaluable in
+observing the basic actions of the system.
+The \fIvmstat\fP(1)
+program is designed to be an aid to monitoring
+systemwide activity. Together with the
+\fIps\fP\|(1)
+command (as in ``ps av''), it can be used to investigate systemwide
+virtual memory activity.
+By running \fIvmstat\fP
+when the system is active you can judge the system activity in several
+dimensions: job distribution, virtual memory load, paging and swapping
+activity, disk and cpu utilization.
+Ideally, in a balanced system,
+there should be few blocked (b) jobs,
+little paging or swapping activity,
+available bandwidth on the disk devices (most single arms peak
+out at 25-35 tps in practice),
+and high (above 50%) user cpu utilization (us).
+.PP
+If the system is busy, then the count of active jobs may be large,
+and several of these jobs may often be blocked (b). If the virtual
+memory is active, then the paging demon will be running (sr will
+be non-zero). It is healthy for the paging demon to free pages when
+the virtual memory gets active; it is triggered by the amount of free
+memory dropping below a threshold and increases its pace as free memory
+goes to zero.
+.PP
+If you run \fIvmstat\fP
+when the system is busy (a ``vmstat 5'' gives all the
+numbers computed by the system), you can find
+imbalances by noting abnormal job distributions. If many
+processes are blocked (b), then the disk subsystem
+is overloaded or imbalanced. If you have several non-dma
+devices or open teletype lines that are ``ringing'', or user programs
+that are doing high-speed non-buffered input/output, then the system
+time may go high (60-80% or higher).
+It is often possible to pin down the cause of high system time by
+looking to see if there is excessive context switching (cs), interrupt
+activity (in) or system call activity (sy). Long term measurements
+on one of
+our large machines show
+an average of 60 context switches and interrupts
+per second and an average of 90 system calls per second.
+.PP
+If the system is heavily loaded, or if you have little memory
+for your load (1 megabyte is little in our environment), then the system
+may be forced to swap. This is likely to be accompanied by a noticeable
+reduction in the system responsiveness and long pauses when interactive
+jobs such as editors swap out.
+.PP
+A second important program is \fIiostat\fP\|(1).
+\fIIostat\fP
+iteratively reports the number of characters read and written to terminals,
+and, for each disk, the number of transfers per second, kilobytes
+transferred per second,
+and the milliseconds per average seek.
+It also gives the percentage of time the system has
+spent in user mode, in user mode running low priority (niced) processes,
+in system mode, and idling.
+.PP
+To compute this information, for each disk, seeks and data transfer completions
+and the number of words transferred are counted;
+for terminals collectively, the number
+of input and output characters are counted.
+Also, every 100 ms,
+the state of each disk is examined
+and a tally is made if the disk is active.
+From these numbers and the transfer rates
+of the devices it is possible to determine
+average seek times for each device.
+.PP
+When filesystems are poorly placed on the available
+disks, figures reported by \fIiostat\fP can be used
+to pinpoint bottlenecks. Under heavy system load, disk
+traffic should be spread out among the drives with
+higher traffic expected to the devices where the root, swap, and
+/tmp filesystems are located. When multiple disk drives are
+attached to the same controller, the system will
+attempt to overlap seek operations with I/O transfers. When
+seeks are performed, \fIiostat\fP will show
+non-zero average seek times. Most modern disk drives should
+exhibit an average seek time of 25-35 ms.
+.PP
+Terminal traffic reported by \fIiostat\fP should be heavily
+output oriented unless terminal lines are being used for
+data transfer by programs such as \fIuucp\fP. Input and
+output rates are system specific. Screen editors
+such as \fIvi\fP and \fIemacs\fP tend to exhibit output/input
+ratios of anywhere from 5/1 to 8/1. On one of our largest
+systems, 88 terminal lines plus 32 pseudo terminals, we observed
+an average of 180 characters/second input and 450 characters/second
+output over 4 days of operation.
+.NH 2
+Kernel profiling
+.PP
+It is simple to build a 4.2BSD kernel that will automatically
+collect profiling information as it operates, by specifying the
+.B \-p
+option to \fIconfig\fP\|(8) when configuring a kernel.
+The program counter sampling can be driven by the system clock,
+or by an alternate real time clock.
+The latter is highly recommended as use of the system clock results
+in statistical anomalies in accounting for
+the time spent in the kernel clock routine.
+.PP
+Once a profiling system has been booted, statistics gathering is
+handled by \fIkgmon\fP\|(8).
+\fIKgmon\fP allows profiling to be started and stopped
+and the internal state of the profiling buffers to be dumped.
+\fIKgmon\fP can also be used to reset the state of the internal
+buffers to allow multiple experiments to be run without
+rebooting the machine.
+.PP
+The profiling data is processed with \fIgprof\fP\|(1)
+to obtain information regarding the system's operation.
+Profiled systems maintain histograms of the kernel program counter,
+the number of invocations of each routine,
+and a dynamic call graph of the executing system.
+The postprocessing propagates the time spent in each
+routine along the arcs of the call graph.
+\fIGprof\fP then generates a listing for each routine in the kernel,
+sorted according to the time it uses,
+including the time of its call graph descendants.
+Below each routine entry is shown its (direct) call graph children,
+and how their times are propagated to this routine.
+A similar display above the routine shows how this routine's time and the
+time of its descendants are propagated to its (direct) call graph parents.
+.PP
+A profiled system is about 5-10% larger in its text space because of
+the calls to count the subroutine invocations.
+When the system executes,
+the profiling data is stored in a buffer that is 1.2
+times the size of the text space.
+All the information is summarized in memory;
+it is not necessary to have a trace file
+continuously dumped to disk.
+The overhead for running a profiled system varies;
+under normal load we see anywhere from 5-25%
+of the system time spent in the profiling code.
+Thus the system is noticeably slower than an unprofiled system,
+yet is not so bad that it cannot be used in a production environment.
+This is important since it allows us to gather data
+in a real environment rather than trying to
+devise synthetic work loads.
+.NH 2
+Kernel tracing
+.PP
+The kernel can be configured to trace certain operations by
+specifying ``options TRACE'' in the configuration file. This
+forces the inclusion of code that records the occurrence of
+events in \fItrace records\fP in a circular buffer in kernel
+memory. Events may be enabled/disabled selectively while the
+system is operating. Each trace record contains a time stamp
+(taken from the VAX hardware time of day clock register), an
+event identifier, and additional information that is interpreted
+according to the event type. Buffer cache operations, such as
+initiating a read, include
+the disk drive, block number, and transfer size in the trace record.
+Virtual memory operations, such as a pagein completing, include
+the virtual address and process id in the trace record. The circular
+buffer is normally configured to hold 256 16-byte trace records.\**
+.FS
+\** The standard trace facilities distributed with 4.2
+differ slightly from those described here. The time stamp in the
+distributed system is calculated from the kernel's time of day
+variable instead of the VAX hardware register, and the buffer cache
+trace points do not record the transfer size.
+.FE
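+.PP
+For concreteness, one plausible layout of such a 16-byte record is
+sketched below; the structure and field names are illustrative
+assumptions, not the 4.2BSD source.
+.DS
+struct tracerec {
+	unsigned long	t_stamp;	/* VAX time of day clock register */
+	unsigned short	t_event;	/* event identifier */
+	unsigned short	t_dev;		/* e.g. disk drive for buffer ops */
+	unsigned long	t_par1;		/* block number or virtual address */
+	unsigned long	t_par2;		/* transfer size or process id */
+};
+.DE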
+.PP
+Several user programs were written to sample and interpret the
+tracing information. One program runs in the background and
+periodically reads the circular buffer of trace records. The
+trace information is compressed, in some instances interpreted
+to generate additional information, and a summary is written to a
+file. In addition, the sampling program can also record
+information from other kernel data structures, such as those
+interpreted by the \fIvmstat\fP program. Data written out to
+a file is further buffered to minimize I/O load.
+.PP
+Once a trace log has been created, programs that compress
+and interpret the data may be run to generate graphs showing the
+data and relationships between traced events and
+system load.
+.PP
+The trace package was used mainly to investigate the operation of
+the file system buffer cache. The sampling program maintained a
+history of read-ahead blocks and used the trace information to
+calculate, for example, percentage of read-ahead blocks used.
+.NH 2
+Benchmark programs
+.PP
+Benchmark programs were used in two ways. First, a suite of
+programs was constructed to calculate the cost of certain basic
+system operations. Operations such as system call overhead and
+context switching time are critically important in evaluating the
+overall performance of a system. Because of the drastic changes in
+the system between 4.1BSD and 4.2BSD, it was important to verify
+that the overhead of these low level operations had not changed appreciably.
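+.PP
+As an illustration, the \fIsyscall\fP benchmark is essentially a timed
+loop around an inexpensive system call; the sketch below follows the
+style of the programs in the Appendix but is not reproduced from them.
+.DS
+#include <stdio.h>
+
+main(argc, argv)
+	char *argv[];
+{
+	register int ncalls;
+
+	if (argc < 2) {
+		printf("usage: %s #syscalls\en", argv[0]);
+		exit(1);
+	}
+	/* time(1) applied to this loop, less the loop overhead,
+	   gives the per-call cost of entering the kernel */
+	ncalls = atoi(argv[1]);
+	while (ncalls-- > 0)
+		(void) getpid();
+	exit(0);
+}
+.DE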
+.PP
+The second use of benchmarks was in exercising
+suspected bottlenecks.
+When we suspected a specific problem with the system,
+a small benchmark program was written to repeatedly use
+the facility.
+While these benchmarks are not useful as a general tool,
+they can give quick feedback on whether a hypothesized
+improvement is really having an effect.
+It is important to realize that the only real assurance
+that a change has a beneficial effect is through
+long term measurements of general timesharing.
+We have numerous examples where a benchmark program
+suggests vast improvements while the change
+in the long term system performance is negligible,
+and conversely examples in which the benchmark program runs more slowly,
+but the long term system performance improves significantly.
diff --git a/share/doc/papers/sysperf/3.t b/share/doc/papers/sysperf/3.t
new file mode 100644
index 000000000000..832ad4255ab3
--- /dev/null
+++ b/share/doc/papers/sysperf/3.t
@@ -0,0 +1,694 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)3.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Results of our observations
+.NH
+Results of our observations
+.PP
+When 4.2BSD was first installed on several large timesharing systems,
+the degradation in performance was significant.
+Informal measurements showed 4.2BSD providing 80% of the throughput
+of 4.1BSD (based on load averages observed under a normal timesharing load).
+Many of the initial problems found were caused by programs that were
+not part of 4.1BSD. Using the techniques described in the previous
+section and standard process profiling, several problems were identified.
+Later work concentrated on the operation of the kernel itself.
+In this section we discuss the problems uncovered; in the next
+section we describe the changes made to the system.
+.NH 2
+User programs
+.PP
+.NH 3
+Mail system
+.PP
+The mail system was the first culprit identified as a major
+contributor to the degradation in system performance.
+At Lucasfilm the mail system is heavily used
+on one machine, a VAX-11/780 with eight megabytes of memory.\**
+.FS
+\** During part of these observations the machine had only four
+megabytes of memory.
+.FE
+Message
+traffic is usually between users on the same machine and ranges from
+person-to-person telephone messages to per-organization distribution
+lists. After conversion to 4.2BSD, it was
+immediately noticed that mail to distribution lists of 20 or more people
+caused the system load to jump by anywhere from 3 to 6 points.
+The number of processes spawned by the \fIsendmail\fP program and
+the messages sent from \fIsendmail\fP to the system logging
+process, \fIsyslog\fP, generated significant load both from their
+execution and their interference with basic system operation. The
+number of context switches and disk transfers often doubled while
+\fIsendmail\fP operated; the system call rate jumped dramatically.
+System accounting information consistently
+showed \fIsendmail\fP as the top cpu user on the system.
+.NH 3
+Network servers
+.PP
+The network services provided in 4.2BSD add new capabilities to the system,
+but are not without cost. The system uses one daemon process to accept
+requests for each network service provided. The presence of many
+such daemons increases the number of active processes and files,
+and requires a larger configuration to support the same number of users.
+The overhead of the routing and status updates can consume
+several percent of the cpu.
+Remote logins and shells incur more overhead
+than their local equivalents.
+For example, a remote login uses three processes and a
+pseudo-terminal handler in addition to the local hardware terminal
+handler. When using a screen editor, sending and echoing a single
+character involves four processes on two machines.
+The additional processes, context switching, network traffic, and
+terminal handler overhead can roughly triple the load presented by one
+local terminal user.
+.NH 2
+System overhead
+.PP
+To measure the costs of various functions in the kernel,
+a profiling system was run for a 17 hour
+period on one of our general timesharing machines.
+While this is not as reproducible as a synthetic workload,
+it certainly represents a realistic test.
+This test was run on several occasions over a three month period.
+Despite the long period of time that elapsed
+between the test runs, the shapes of the profiles,
+as measured by the number of times each system call
+entry point was called, were remarkably similar.
+.PP
+These profiles turned up several bottlenecks that are
+discussed in the next section.
+Several of these were new to 4.2BSD,
+but most were caused by overloading of mechanisms
+which worked acceptably well in previous BSD systems.
+The general conclusion from our measurements was that
+the ratio of system to user time had increased from
+45% system / 55% user in 4.1BSD to 57% system / 43% user
+in 4.2BSD.
+.NH 3
+Micro-operation benchmarks
+.PP
+To compare certain basic system operations
+between 4.1BSD and 4.2BSD a suite of benchmark
+programs was constructed and run on a VAX-11/750 with 4.5 megabytes
+of physical memory and two disks on a MASSBUS controller.
+Tests were run with the machine operating in single user mode
+under both 4.1BSD and 4.2BSD. Paging was localized to the drive
+where the root file system was located.
+.PP
+The benchmark programs were modeled after the Kashtan benchmarks,
+[Kashtan80], with identical sources compiled under each system.
+The programs and their intended purpose are described briefly
+before the presentation of the results. The benchmark scripts
+were run twice with the results shown as the average of
+the two runs.
+The source code for each program and the shell scripts used during
+the benchmarks are included in the Appendix.
+.PP
+The set of tests shown in Table 1 was concerned with
+system operations other than paging. The intent of most
+benchmarks is clear. The result of running \fIsignocsw\fP is
+deducted from the \fIcsw\fP benchmark to calculate the context
+switch overhead. The \fIexec\fP tests use two different jobs to gauge
+the cost of overlaying a larger program with a smaller one
+and vice versa. The
+``null job'' and ``big job'' differ solely in the size of their data
+segments, 1 kilobyte versus 256 kilobytes. In both cases the
+text segment of the parent is larger than that of the child.\**
+.FS
+\** These tests should also have measured the cost of expanding the
+text segment; unfortunately time did not permit running additional tests.
+.FE
+All programs were compiled into the default load format that causes
+the text segment to be demand paged out of the file system and shared
+between processes.
+.KF
+.DS L
+.TS
+center box;
+l | l.
+Test Description
+_
+syscall perform 100,000 \fIgetpid\fP system calls
+csw perform 10,000 context switches using signals
+signocsw send 10,000 signals to yourself
+pipeself4 send 10,000 4-byte messages to yourself
+pipeself512 send 10,000 512-byte messages to yourself
+pipediscard4 send 10,000 4-byte messages to child who discards
+pipediscard512 send 10,000 512-byte messages to child who discards
+pipeback4 exchange 10,000 4-byte messages with child
+pipeback512 exchange 10,000 512-byte messages with child
+forks0 fork-exit-wait 1,000 times
+forks1k sbrk(1024), fault page, fork-exit-wait 1,000 times
+forks100k sbrk(102400), fault pages, fork-exit-wait 1,000 times
+vforks0 vfork-exit-wait 1,000 times
+vforks1k sbrk(1024), fault page, vfork-exit-wait 1,000 times
+vforks100k sbrk(102400), fault pages, vfork-exit-wait 1,000 times
+execs0null fork-exec ``null job''-exit-wait 1,000 times
+execs0null (1K env) execs0null above, with 1K environment added
+execs1knull sbrk(1024), fault page, fork-exec ``null job''-exit-wait 1,000 times
+execs1knull (1K env) execs1knull above, with 1K environment added
+execs100knull sbrk(102400), fault pages, fork-exec ``null job''-exit-wait 1,000 times
+vexecs0null vfork-exec ``null job''-exit-wait 1,000 times
+vexecs1knull sbrk(1024), fault page, vfork-exec ``null job''-exit-wait 1,000 times
+vexecs100knull sbrk(102400), fault pages, vfork-exec ``null job''-exit-wait 1,000 times
+execs0big fork-exec ``big job''-exit-wait 1,000 times
+execs1kbig sbrk(1024), fault page, fork-exec ``big job''-exit-wait 1,000 times
+execs100kbig sbrk(102400), fault pages, fork-exec ``big job''-exit-wait 1,000 times
+vexecs0big vfork-exec ``big job''-exit-wait 1,000 times
+vexecs1kbig sbrk(1024), fault pages, vfork-exec ``big job''-exit-wait 1,000 times
+vexecs100kbig sbrk(102400), fault pages, vfork-exec ``big job''-exit-wait 1,000 times
+.TE
+.ce
+Table 1. Kernel Benchmark programs.
+.DE
+.KE
+.PP
+The results of these tests are shown in Table 2. If the 4.1BSD results
+are scaled to reflect their being run on a VAX-11/750, they
+correspond closely to those found in [Joy80].\**
+.FS
+\** We assume that a VAX-11/750 runs at 60% of the speed of a VAX-11/780
+(not considering floating point operations).
+.FE
+.KF
+.DS L
+.TS
+center box;
+c s s s s s s s s s
+c || c s s || c s s || c s s
+c || c s s || c s s || c s s
+c || c | c | c || c | c | c || c | c | c
+l || n | n | n || n | n | n || n | n | n.
+Berkeley Software Distribution UNIX Systems
+_
+Test Elapsed Time User Time System Time
+\^ _ _ _
+\^ 4.1 4.2 4.3 4.1 4.2 4.3 4.1 4.2 4.3
+=
+syscall 28.0 29.0 23.0 4.5 5.3 3.5 23.9 23.7 20.4
+csw 45.0 60.0 45.0 3.5 4.3 3.3 19.5 25.4 19.0
+signocsw 16.5 23.0 16.0 1.9 3.0 1.1 14.6 20.1 15.2
+pipeself4 21.5 29.0 26.0 1.1 1.1 0.8 20.1 28.0 25.6
+pipeself512 47.5 59.0 55.0 1.2 1.2 1.0 46.1 58.3 54.2
+pipediscard4 32.0 42.0 36.0 3.2 3.7 3.0 15.5 18.8 15.6
+pipediscard512 61.0 76.0 69.0 3.1 2.1 2.0 29.7 36.4 33.2
+pipeback4 57.0 75.0 66.0 2.9 3.2 3.3 25.1 34.2 29.7
+pipeback512 110.0 138.0 125.0 3.1 3.4 2.2 52.2 65.7 57.7
+forks0 37.5 41.0 22.0 0.5 0.3 0.3 34.5 37.6 21.5
+forks1k 40.0 43.0 22.0 0.4 0.3 0.3 36.0 38.8 21.6
+forks100k 217.5 223.0 176.0 0.7 0.6 0.4 214.3 218.4 175.2
+vforks0 34.5 37.0 22.0 0.5 0.6 0.5 27.3 28.5 17.9
+vforks1k 35.0 37.0 22.0 0.6 0.8 0.5 27.2 28.6 17.9
+vforks100k 35.0 37.0 22.0 0.6 0.8 0.6 27.6 28.9 17.9
+execs0null 97.5 92.0 66.0 3.8 2.4 0.6 68.7 82.5 48.6
+execs0null (1K env) 197.0 229.0 75.0 4.1 2.6 0.9 167.8 212.3 62.6
+execs1knull 99.0 100.0 66.0 4.1 1.9 0.6 70.5 86.8 48.7
+execs1knull (1K env) 199.0 230.0 75.0 4.2 2.6 0.7 170.4 214.9 62.7
+execs100knull 283.5 278.0 216.0 4.8 2.8 1.1 251.9 269.3 202.0
+vexecs0null 100.0 92.0 66.0 5.1 2.7 1.1 63.7 76.8 45.1
+vexecs1knull 100.0 91.0 66.0 5.2 2.8 1.1 63.2 77.1 45.1
+vexecs100knull 100.0 92.0 66.0 5.1 3.0 1.1 64.0 77.7 45.6
+execs0big 129.0 201.0 101.0 4.0 3.0 1.0 102.6 153.5 92.7
+execs1kbig 130.0 202.0 101.0 3.7 3.0 1.0 104.7 155.5 93.0
+execs100kbig 318.0 385.0 263.0 4.8 3.1 1.1 286.6 339.1 247.9
+vexecs0big 128.0 200.0 101.0 4.6 3.5 1.6 98.5 149.6 90.4
+vexecs1kbig 125.0 200.0 101.0 4.7 3.5 1.3 98.9 149.3 88.6
+vexecs100kbig 126.0 200.0 101.0 4.2 3.4 1.3 99.5 151.0 89.0
+.TE
+.ce
+Table 2. Kernel Benchmark results (all times in seconds).
+.DE
+.KE
+.PP
+In studying the measurements we found that the basic system call
+and context switch overhead did not change significantly
+between 4.1BSD and 4.2BSD. The \fIsignocsw\fP results were caused by
+the changes to the \fIsignal\fP interface, resulting
+in an additional subroutine invocation for each call, not
+to mention additional complexity in the system's implementation.
+.PP
+The times for the use of pipes are significantly higher under
+4.2BSD because of their implementation on top of the interprocess
+communication facilities. Under 4.1BSD pipes were implemented
+without the complexity of the socket data structures and with
+simpler code. Further, while not obviously a factor here,
+4.2BSD pipes have less system buffer space provided them than
+4.1BSD pipes.
+.PP
+The \fIexec\fP tests shown in Table 2 were performed with 34 bytes of
+environment information under 4.1BSD and 40 bytes under 4.2BSD.
+To figure the cost of passing data through the environment,
+the execs0null and execs1knull tests were rerun with
+1065 additional bytes of data. The results are shown in Table 3.
+.KF
+.DS L
+.TS
+center box;
+c || c s || c s || c s
+c || c s || c s || c s
+c || c | c || c | c || c | c
+l || n | n || n | n || n | n.
+Test Real User System
+\^ _ _ _
+\^ 4.1 4.2 4.1 4.2 4.1 4.2
+=
+execs0null 197.0 229.0 4.1 2.6 167.8 212.3
+execs1knull 199.0 230.0 4.2 2.6 170.4 214.9
+.TE
+.ce
+Table 3. Benchmark results with ``large'' environment (all times in seconds).
+.DE
+.KE
+These results show that passing argument data is significantly
+slower than under 4.1BSD: 121 \(*ms/byte versus 93 \(*ms/byte
+(the extra 1065 bytes account for 129.8 seconds of system time over
+the 1,000 iterations under 4.2BSD, versus 99.1 seconds under 4.1BSD).
+Even using
+this factor to adjust the basic overhead of an \fIexec\fP system
+call, this facility is more costly under 4.2BSD than under 4.1BSD.
+.NH 3
+Path name translation
+.PP
+The single most expensive function performed by the kernel
+is path name translation.
+This has been true in almost every UNIX kernel [Mosher80];
+we find that our general time sharing systems do about
+500,000 name translations per day.
+.PP
+Name translations became more expensive in 4.2BSD for several reasons.
+The single most expensive addition was the symbolic link.
+Symbolic links
+have the effect of increasing the average number of components
+in path names to be translated.
+As an insidious example,
+consider the system manager that decides to change /tmp
+to be a symbolic link to /usr/tmp.
+A name such as /tmp/tmp1234 that previously required two component
+translations,
+now requires four component translations plus the cost of reading
+the contents of the symbolic link.
+.PP
+The new directory format also changes the characteristics of
+name translation.
+The more complex format requires more computation to determine
+where to place new entries in a directory.
+Conversely the additional information allows the system to only
+look at active entries when searching,
+hence searches of directories that had once grown large
+but currently have few active entries are checked quickly.
+The new format also stores the length of each name so that
+costly string comparisons are only done on names that are the
+same length as the name being sought.
+.PP
+The net effect of the changes is that the average time to
+translate a path name in 4.2BSD is 24.2 milliseconds,
+representing 40% of the time processing system calls,
+that is 19% of the total cycles in the kernel,
+or 11% of all cycles executed on the machine.
+The times are shown in Table 4. We have no comparable times
+for \fInamei\fP under 4.1 though they are certain to
+be significantly less.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 14.3 ms/call 11.3%
+child 9.9 ms/call 7.9%
+_
+total 24.2 ms/call 19.2%
+.TE
+.ce
+Table 4. Call times for \fInamei\fP in 4.2BSD.
+.DE
+.KE
+.NH 3
+Clock processing
+.PP
+Nearly 25% of the time spent in the kernel is spent in the clock
+processing routines.
+(This is a clear indication that to avoid sampling bias when profiling the
+kernel with our tools
+we need to drive them from an independent clock.)
+These routines are responsible for implementing timeouts,
+scheduling the processor,
+maintaining kernel statistics,
+and tending various hardware operations such as
+draining the terminal input silos.
+Only minimal work is done in the hardware clock interrupt
+routine (at high priority), the rest is performed (at a lower priority)
+in a software interrupt handler scheduled by the hardware interrupt
+handler.
+In the worst case, with a clock rate of 100 Hz
+and with every hardware interrupt scheduling a software
+interrupt, the processor must field 200 interrupts per second.
+The overhead of simply trapping and returning
+is 3% of the machine cycles,
+figuring out that there is nothing to do
+requires an additional 2%.
+.NH 3
+Terminal multiplexors
+.PP
+The terminal multiplexors supported by 4.2BSD have programmable receiver
+silos that may be used in two ways.
+With the silo disabled, each character received causes an interrupt
+to the processor.
+Enabling the receiver silo allows the silo to fill before
+generating an interrupt, allowing multiple characters to be read
+for each interrupt.
+At low rates of input, received characters will not be processed
+for some time unless the silo is emptied periodically.
+The 4.2BSD kernel uses the input silos of each terminal multiplexor,
+and empties each silo on each clock interrupt.
+This allows high input rates without the cost of per-character interrupts
+while assuring low latency.
+However, as character input rates on most machines are usually
+low (about 25 characters per second),
+this can result in excessive overhead.
+At the current clock rate of 100 Hz, a machine with 5 terminal multiplexors
+configured makes 500 calls to the receiver interrupt routines per second.
+In addition, to achieve acceptable input latency
+for flow control, each clock interrupt must schedule
+a software interrupt to run the silo draining routines.\**
+.FS
+\** It is not possible to check the input silos at
+the time of the actual clock interrupt without modifying the terminal
+line disciplines, as the input queues may not be in a consistent state \**.
+.FE
+.FS
+\** This implies that the worst case estimate for clock processing
+is the basic overhead for clock processing.
+.FE
+.NH 3
+Process table management
+.PP
+In 4.2BSD there are numerous places in the kernel where a linear search
+of the process table is performed:
+.IP \(bu 3
+in \fIexit\fP to locate and wakeup a process's parent;
+.IP \(bu 3
+in \fIwait\fP when searching for \fB\s-2ZOMBIE\s+2\fP and
+\fB\s-2STOPPED\s+2\fP processes;
+.IP \(bu 3
+in \fIfork\fP when allocating a new process table slot and
+counting the number of processes already created by a user;
+.IP \(bu 3
+in \fInewproc\fP, to verify
+that a process id assigned to a new process is not currently
+in use;
+.IP \(bu 3
+in \fIkill\fP and \fIgsignal\fP to locate all processes to
+which a signal should be delivered;
+.IP \(bu 3
+in \fIschedcpu\fP when adjusting the process priorities every
+second; and
+.IP \(bu 3
+in \fIsched\fP when locating a process to swap out and/or swap
+in.
+.LP
+These linear searches can incur significant overhead. The rule
+for calculating the size of the process table is:
+.ce
+nproc = 20 + 8 * maxusers
+.sp
+which means that a 48 user system will have a 404 slot process table.
+With the addition of network services in 4.2BSD, as many as a dozen
+server processes may be maintained simply to await incoming requests.
+These servers are normally created at boot time which causes them
+to be allocated slots near the beginning of the process table. This
+means that process table searches under 4.2BSD are likely to take
+significantly longer than under 4.1BSD. System profiling shows
+that as much as 20% of the time spent in the kernel on a loaded
+system (a VAX-11/780) can be spent in \fIschedcpu\fP and, on average,
+5-10% of the kernel time is spent in \fIschedcpu\fP.
+The other searches of the proc table are similarly affected.
+This shows that the system can no longer tolerate using linear searches of
+the process table.
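+.PP
+The shape of these searches is sketched below; the structure and
+function names are simplified assumptions, not the kernel source.
+.DS
+struct proc {
+	int	p_pid;
+	/* ... */
+} proc[404];		/* nproc = 20 + 8 * maxusers, maxusers = 48 */
+
+struct proc *
+pfind(pid)		/* find a process by id */
+	int pid;
+{
+	register struct proc *p;
+
+	/* every lookup walks all nproc slots in the worst case */
+	for (p = &proc[0]; p < &proc[404]; p++)
+		if (p->p_pid == pid)
+			return (p);
+	return ((struct proc *)0);
+}
+.DE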
+.NH 3
+File system buffer cache
+.PP
+The trace facilities described in section 2.3 were used
+to gather statistics on the performance of the buffer cache.
+We were interested in measuring the effectiveness of the
+cache and the read-ahead policies.
+With the file system block size in 4.2BSD four to
+eight times that of a 4.1BSD file system, we were concerned
+that large amounts of read-ahead might be performed without
+being used. Also, we were interested in seeing if the
+rules used to size the buffer cache at boot time were severely
+affecting the overall cache operation.
+.PP
+The tracing package was run over a three hour period during
+a peak mid-afternoon period on a VAX 11/780 with four megabytes
+of physical memory.
+This resulted in a buffer cache containing 400 kilobytes of memory
+spread among 50 to 200 buffers
+(the actual number of buffers depends on the size mix of
+disk blocks being read at any given time).
+The pertinent configuration information is shown in Table 5.
+.KF
+.DS L
+.TS
+center box;
+l l l l.
+Controller Drive Device File System
+_
+DEC MASSBUS DEC RP06 hp0d /usr
+ hp0b swap
+Emulex SC780 Fujitsu Eagle hp1a /usr/spool/news
+ hp1b swap
+ hp1e /usr/src
+ hp1d /u0 (users)
+ Fujitsu Eagle hp2a /tmp
+ hp2b swap
+ hp2d /u1 (users)
+ Fujitsu Eagle hp3a /
+.TE
+.ce
+Table 5. Active file systems during buffer cache tests.
+.DE
+.KE
+.PP
+During the test period the load average ranged from 2 to 13
+with an average of 5.
+The system had no idle time, 43% user time, and 57% system time.
+The system averaged 90 interrupts per second
+(excluding the system clock interrupts),
+220 system calls per second,
+and 50 context switches per second (40 voluntary, 10 involuntary).
+.PP
+The active virtual memory (the sum of the address space sizes of
+all jobs that have run in the previous twenty seconds)
+over the period ranged from 2 to 6 megabytes with an average
+of 3.5 megabytes.
+There was no swapping, though the page daemon was inspecting
+about 25 pages per second.
+.PP
+On average 250 requests to read disk blocks were initiated
+per second.
+These include read requests for file blocks made by user
+programs as well as requests initiated by the system.
+System reads include requests for indexing information to determine
+where a file's next data block resides,
+file system layout maps to allocate new data blocks,
+and requests for directory contents needed to do path name translations.
+.PP
+On average, an 85% cache hit rate was observed for read requests.
+Thus only 37 disk reads were initiated per second.
+In addition, 5 read-ahead requests were made each second
+filling about 20% of the buffer pool.
+Despite the policies to rapidly reuse read-ahead buffers
+that remain unclaimed, more than 90% of the read-ahead
+buffers were used.
+.PP
+These measurements showed that the buffer cache was working
+effectively. Independent tests have also shown that the size
+of the buffer cache may be reduced significantly on memory-poor
+systems without severe effects;
+we have not yet tested this hypothesis [Shannon83].
+.NH 3
+Network subsystem
+.PP
+The overhead associated with the
+network facilities found in 4.2BSD is often
+difficult to gauge without profiling the system.
+This is because most input processing is performed
+in modules scheduled with software interrupts.
+As a result, the system time spent performing protocol
+processing is rarely attributed to the processes that
+really receive the data. Since the protocols supported
+by 4.2BSD can involve significant overhead this was a serious
+concern. Results from a profiled kernel show an average
+of 5% of the system time is spent
+performing network input and timer processing in our environment
+(a 3Mb/s Ethernet with most traffic using TCP).
+This figure can vary significantly depending on
+the network hardware used, the average message
+size, and whether packet reassembly is required at the network
+layer. On one machine we profiled over a 17 hour
+period (our gateway to the ARPANET)
+206,000 input messages accounted for 2.4% of the system time,
+while another 0.6% of the system time was spent performing
+protocol timer processing.
+This machine was configured with an ACC LH/DH IMP interface
+and a DMA 3Mb/s Ethernet controller.
+.PP
+The performance of TCP over slower long-haul networks
+was degraded substantially by two problems.
+The first problem was a bug that prevented round-trip timing measurements
+from being made, thus increasing retransmissions unnecessarily.
+The second was a problem with the maximum segment size chosen by TCP,
+that was well-tuned for Ethernet, but was poorly chosen for
+the ARPANET, where it causes packet fragmentation. (The maximum
+segment size was actually negotiated upwards to a value that
+resulted in excessive fragmentation.)
+.PP
+When benchmarked in Ethernet environments the main memory buffer management
+of the network subsystem presented some performance anomalies.
+The overhead of processing small ``mbufs'' severely affected throughput for a
+substantial range of message sizes.
+In spite of the fact that most system utilities made use of the
+throughput-optimal 1024 byte size, user processes faced large degradations
+for some arbitrary sizes. This was especially true for TCP/IP transmissions
+[Cabrera84, Cabrera85].
+.NH 3
+Virtual memory subsystem
+.PP
+We ran a set of tests intended to exercise the virtual
+memory system under both 4.1BSD and 4.2BSD.
+The tests are described in Table 6.
+The test programs dynamically allocated
+a 7.3 Megabyte array (using \fIsbrk\fP\|(2)) then referenced
+pages in the array either: sequentially, in a purely random
+fashion, or such that the distance between
+successive pages accessed was randomly selected from a Gaussian
+distribution. In the last case, successive runs were made with
+increasing standard deviations.
+.KF
+.DS L
+.TS
+center box;
+l | l.
+Test Description
+_
+seqpage sequentially touch pages, 10 iterations
+seqpage-v as above, but first make \fIvadvise\fP\|(2) call
+randpage touch random page 30,000 times
+randpage-v as above, but first make \fIvadvise\fP call
+gausspage.1 30,000 Gaussian accesses, standard deviation of 1
+gausspage.10 as above, standard deviation of 10
+gausspage.30 as above, standard deviation of 30
+gausspage.40 as above, standard deviation of 40
+gausspage.50 as above, standard deviation of 50
+gausspage.60 as above, standard deviation of 60
+gausspage.80 as above, standard deviation of 80
+gausspage.inf as above, standard deviation of 10,000
+.TE
+.ce
+Table 6. Paging benchmark programs.
+.DE
+.KE
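+.PP
+As a sketch of how these tests operate, the \fIrandpage\fP program
+below is modeled on, but not copied from, the benchmark sources in the
+Appendix; the page size constant is an assumption.
+.DS
+#define	NBPG	1024		/* assumed page size in bytes */
+#define	ARRSIZ	(7300 * 1024)	/* the 7.3 Mbyte test array */
+#define	NITER	30000
+
+main()
+{
+	register int i;
+	register char *base;
+	char *sbrk();
+	long random();
+
+	/* dynamically allocate the array, then touch random pages */
+	base = sbrk(ARRSIZ);
+	for (i = 0; i < NITER; i++)
+		base[(random() % (ARRSIZ / NBPG)) * NBPG]++;
+	exit(0);
+}
+.DE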
+.PP
+The results in Table 7 show how the additional
+memory requirements
+of 4.2BSD can generate more work for the paging system.
+Under 4.1BSD,
+the system used 0.5 of the 4.5 megabytes of physical memory
+on the test machine;
+under 4.2BSD it used nearly 1 megabyte of physical memory.\**
+.FS
+\** The 4.1BSD system used for testing was really a 4.1a
+system configured
+with networking facilities and code to support
+remote file access. The
+4.2BSD system also included the remote file access code.
+Since both
+systems would be larger than similarly configured ``vanilla''
+4.1BSD or 4.2BSD systems, we consider our conclusions to still be valid.
+.FE
+This resulted in more page faults and, hence, more system time.
+To establish a common ground on which to compare the paging
+routines of each system, we check instead the average page fault
+service times for those test runs that had a statistically significant
+number of random page faults. These figures, shown in Table 8, show
+no significant difference between the two systems in
+the area of page fault servicing. We currently have
+no explanation for the results of the sequential
+paging tests.
+.KF
+.DS L
+.TS
+center box;
+l || c s || c s || c s || c s
+l || c s || c s || c s || c s
+l || c | c || c | c || c | c || c | c
+l || n | n || n | n || n | n || n | n.
+Test Real User System Page Faults
+\^ _ _ _ _
+\^ 4.1 4.2 4.1 4.2 4.1 4.2 4.1 4.2
+=
+seqpage 959 1126 16.7 12.8 197.0 213.0 17132 17113
+seqpage-v 579 812 3.8 5.3 216.0 237.7 8394 8351
+randpage 571 569 6.7 7.6 64.0 77.2 8085 9776
+randpage-v 572 562 6.1 7.3 62.2 77.5 8126 9852
+gausspage.1 25 24 23.6 23.8 0.8 0.8 8 8
+gausspage.10 26 26 22.7 23.0 3.2 3.6 2 2
+gausspage.30 34 33 25.0 24.8 8.6 8.9 2 2
+gausspage.40 42 81 23.9 25.0 11.5 13.6 3 260
+gausspage.50 113 175 24.2 26.2 19.6 26.3 784 1851
+gausspage.60 191 234 27.6 26.7 27.4 36.0 2067 3177
+gausspage.80 312 329 28.0 27.9 41.5 52.0 3933 5105
+gausspage.inf 619 621 82.9 85.6 68.3 81.5 8046 9650
+.TE
+.ce
+Table 7. Paging benchmark results (all times in seconds).
+.DE
+.KE
+.KF
+.DS L
+.TS
+center box;
+c || c s || c s
+c || c s || c s
+c || c | c || c | c
+l || n | n || n | n.
+Test Page Faults PFST
+\^ _ _
+\^ 4.1 4.2 4.1 4.2
+=
+randpage 8085 9776 791 789
+randpage-v 8126 9852 765 786
+gausspage.inf 8046 9650 848 844
+.TE
+.ce
+Table 8. Page fault service times (all times in microseconds).
+.DE
+.KE
diff --git a/share/doc/papers/sysperf/4.t b/share/doc/papers/sysperf/4.t
new file mode 100644
index 000000000000..cdbfb1438a42
--- /dev/null
+++ b/share/doc/papers/sysperf/4.t
@@ -0,0 +1,774 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)4.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Performance Improvements
+.NH
+Performance Improvements
+.PP
+This section outlines the changes made to the system
+since the 4.2BSD distribution.
+The changes reported here were made in response
+to the problems described in Section 3.
+The improvements fall into two major classes:
+changes to the kernel that are described in this section,
+and changes to the system libraries and utilities that are
+described in the following section.
+.NH 2
+Performance Improvements in the Kernel
+.PP
+Our goal has been to optimize system performance
+for our general timesharing environment.
+Since most sites running 4.2BSD have been forced to take
+advantage of declining
+memory costs rather than replace their existing machines with
+ones that are more powerful, we have
+chosen to optimize running time at the expense of memory.
+This tradeoff may need to be reconsidered for personal workstations
+that have smaller memories and higher latency disks.
+Decreases in the running time of the system may be unnoticeable
+because of higher paging rates incurred by a larger kernel.
+Where possible, we have allowed the size of caches to be controlled
+so that systems with limited memory may reduce them as appropriate.
+.NH 3
+Name Caching
+.PP
+Our initial profiling studies showed that more than one quarter
+of the time in the system was spent in the
+pathname translation routine, \fInamei\fP,
+translating path names to inodes\u\s-21\s0\d\**.
+.FS
+\** \u\s-21\s0\d Inode is an abbreviation for ``Index node''.
+Each file on the system is described by an inode;
+the inode maintains access permissions, and an array of pointers to
+the disk blocks that hold the data associated with the file.
+.FE
+An inspection of \fInamei\fP shows that
+it consists of two nested loops.
+The outer loop is traversed once per pathname component.
+The inner loop performs a linear search through a directory looking
+for a particular pathname component.
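+.PP
+This structure can be modeled in a few lines of user code; all names
+below are hypothetical and chosen only to make the two loops explicit.
+.DS
+#include <stdio.h>
+#include <string.h>
+
+char *dirents[] = { ".", "..", "bin", "tmp", "usr", 0 };
+
+lookup(name)		/* inner loop: linear directory scan */
+	char *name;
+{
+	register char **dp;
+
+	for (dp = dirents; *dp; dp++)
+		if (strcmp(*dp, name) == 0)
+			return (dp - dirents);
+	return (-1);
+}
+
+main()			/* outer loop: once per component */
+{
+	static char path[] = "usr/tmp";
+	register char *comp;
+
+	for (comp = strtok(path, "/"); comp;
+	    comp = strtok((char *)0, "/"))
+		printf("%s: slot %d\en", comp, lookup(comp));
+	exit(0);
+}
+.DE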
+.PP
+Our first idea was to reduce the number of iterations
+around the inner loop of \fInamei\fP by observing that many programs
+step through a directory performing an operation on each entry in turn.
+To improve performance for processes doing directory scans,
+the system keeps track of the directory offset of the last component of the
+most recently translated path name for each process.
+If the next name the process requests is in the same directory,
+the search is started from the offset that the previous name was found
+(instead of from the beginning of the directory).
+Changing directories invalidates the cache, as
+does modifying the directory.
+For programs that step sequentially through a directory with
+.EQ
+delim $$
+.EN
+$N$ files, search time decreases from $O ( N sup 2 )$ to $O(N)$.
+.EQ
+delim off
+.EN
+.PP
+The cost of the cache is about 20 lines of code
+(about 0.2 kilobytes)
+and 16 bytes per process, with the cached data
+stored in a process's \fIuser\fP vector.
+.PP
+As a quick benchmark to verify the maximum effectiveness of the
+cache we ran ``ls \-l''
+on a directory containing 600 files.
+Before the per-process cache this command
+used 22.3 seconds of system time.
+After adding the cache the program used the same amount
+of user time, but the system time dropped to 3.3 seconds.
+.PP
+This change prompted our rerunning a profiled system
+on a machine containing the new \fInamei\fP.
+The results showed that the time in \fInamei\fP
+dropped by only 2.6 ms/call and
+still accounted for 36% of the system call time,
+18% of the kernel, or about 10% of all the machine cycles.
+This amounted to a drop in system time from 57% to about 55%.
+The results are shown in Table 9.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 11.0 ms/call 9.2%
+child 10.6 ms/call 8.9%
+_
+total 21.6 ms/call 18.1%
+.TE
+.ce
+Table 9. Call times for \fInamei\fP with per-process cache.
+.DE
+.KE
+.PP
+The small performance improvement
+was caused by a low cache hit ratio.
+Although the cache was 90% effective when hit,
+it was only usable on about 25% of the names being translated.
+An additional reason for the small improvement was that
+although the amount of time spent in \fInamei\fP itself
+decreased substantially,
+more time was spent in the routines that it called
+since each directory had to be accessed twice;
+once to search from the middle to the end,
+and once to search from the beginning to the middle.
+.PP
+Frequent requests for a small set of names are best handled
+with a cache of recent name translations\**.
+.FS
+\** The cache is keyed on a name and the
+inode and device number of the directory that contains it.
+Associated with each entry is a pointer to the corresponding
+entry in the inode table.
+.FE
+This has the effect of eliminating the inner loop of \fInamei\fP.
+For each path name component,
+\fInamei\fP first looks in its cache of recent translations
+for the needed name.
+If it exists, the directory search can be completely eliminated.
+.PP
+The system already maintained a cache of recently accessed inodes,
+so the initial name cache
+maintained a simple name-inode association that was used to
+check each component of a path name during name translations.
+We considered implementing the cache by tagging each inode
+with its most recently translated name,
+but eventually decided to have a separate data structure that
+kept names with pointers to the inode table.
+Tagging inodes has two drawbacks:
+many inodes, such as those associated with login ports, remain in
+the inode table for a long period of time, but are never looked
+up by name.
+Other inodes, such as those describing directories, are looked up
+frequently by many different names (\fIe.g.\fP ``..'').
+By keeping a separate table of names, the cache can
+truly reflect the most recently used names.
+An added benefit is that the table can be sized independently
+of the inode table, so that machines with small amounts of memory
+can reduce the size of the cache (or even eliminate it)
+without modifying the inode table structure.
+.PP
+Another issue to be considered is how the name cache should
+hold references to the inode table.
+Normally processes hold ``hard references'' by incrementing the
+reference count in the inode they reference.
+Since the system reuses only inodes with zero reference counts,
+a hard reference ensures that the inode pointer will remain valid.
+However, if the name cache holds hard references,
+it is limited to some fraction of the size of the inode table,
+since some inodes must be left free for new files.
+It also makes it impossible for other parts of the kernel
+to verify sole use of a device or file.
+These reasons made it impractical to use hard references
+without affecting the behavior of the inode caching scheme.
+Thus, we chose instead to keep ``soft references'' protected
+by a \fIcapability\fP \- a 32-bit number
+guaranteed to be unique\u\s-22\s0\d \**.
+.FS
+\** \u\s-22\s0\d When all the numbers have been exhausted, all outstanding
+capabilities are purged and numbering starts over from scratch.
+Purging is possible as all capabilities are easily found in kernel memory.
+.FE
+When an entry is made in the name cache,
+the capability of its inode is copied to the name cache entry.
+When an inode is reused it is issued a new capability.
+When a name cache hit occurs,
+the capability of the name cache entry is compared
+with the capability of the inode that it references.
+If the capabilities do not match, the name cache entry is invalid.
+Since the name cache holds only soft references,
+it may be sized independently of the size of the inode table.
+A final benefit of using capabilities is that all
+cached names for an inode may be invalidated without
+searching through the entire cache;
+instead all you need to do is assign a new capability to the inode.
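+.PP
+A name cache entry thus needs only the key (name plus directory),
+a soft pointer to the inode, and the saved capability;
+the sketch below uses hypothetical field names, not the kernel source.
+.DS
+struct inode;			/* assumed to carry its capability, i_id */
+
+struct ncache {
+	struct ncache *nc_link;	/* hash chain for lookup by name */
+	struct inode *nc_dp;	/* directory that contains the name */
+	char	nc_name[14];	/* pathname component */
+	struct inode *nc_ip;	/* soft reference to the target inode */
+	long	nc_id;		/* capability copied when entry was made */
+};
+.DE
+On a hit, \fInc_id\fP is compared with the current capability of
+\fInc_ip\fP; a mismatch means the inode has been reused and the
+entry is discarded.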
+.PP
+The cost of the name cache is about 200 lines of code
+(about 1.2 kilobytes)
+and 48 bytes per cache entry.
+Depending on the size of the system,
+about 200 to 1000 entries will normally be configured,
+using 10-50 kilobytes of physical memory.
+The name cache is resident in memory at all times.
+.PP
+After adding the system wide name cache we reran ``ls \-l''
+on the same directory.
+The user time remained the same;
+however, the system time rose slightly to 3.7 seconds.
+This was not surprising as \fInamei\fP
+now had to maintain the cache,
+but was never able to make any use of it.
+.PP
+Another profiled system was created and measurements
+were collected over a 17 hour period. These measurements
+showed a 13 ms/call decrease in \fInamei\fP, with
+\fInamei\fP accounting for only 26% of the system call time,
+13% of the time in the kernel,
+or about 7% of all the machine cycles.
+System time dropped from 55% to about 49%.
+The results are shown in Table 10.
+.KF
+.DS L
+.TS
+center box;
+l r r.
+part time % of kernel
+_
+self 4.2 ms/call 6.2%
+child 4.4 ms/call 6.6%
+_
+total 8.6 ms/call 12.8%
+.TE
+.ce
+Table 10. Call times for \fInamei\fP with both caches.
+.DE
+.KE
+.PP
+On our general time sharing systems we find that during the twelve-hour
+period from 8AM to 8PM the system does 500,000 to 1,000,000
+name translations.
+Statistics on the performance of both caches show that
+the large performance improvement is
+caused by the high hit ratio.
+The name cache has a hit rate of 70%-80%;
+the directory offset cache gets a hit rate of 5%-15%.
+The combined hit rate of the two caches almost always adds up to 85%.
+With the addition of the two caches,
+the percentage of system time devoted to name translation has
+dropped from 25% to less than 13%.
+While the system wide cache reduces both the amount of time in
+the routines that \fInamei\fP calls as well as \fInamei\fP itself
+(since fewer directories need to be accessed or searched),
+it is interesting to note that the actual percentage of system
+time spent in \fInamei\fP itself increases even though the
+actual time per call decreases.
+This is because less total time is being spent in the kernel,
+hence a smaller absolute time becomes a larger total percentage.
+.NH 3
+Intelligent Auto Siloing
+.PP
+Most terminal input hardware can run in two modes:
+it can either generate an interrupt each time a character is received,
+or collect characters in a silo that the system then periodically drains.
+To provide quick response for interactive input and flow control,
+a silo must be checked 30 to 50 times per second.
+ASCII terminals normally exhibit
+an input rate of less than 30 characters per second.
+At this input rate
+they are most efficiently handled with interrupt per character mode,
+since this generates fewer interrupts than draining the input silos
+of the terminal multiplexors at each clock interrupt.
+When input is being generated by another machine
+or a malfunctioning terminal connection, however,
+the input rate is usually more than 50 characters per second.
+It is more efficient to use a device's silo input mode,
+since this generates fewer interrupts than handling each character
+as a separate interrupt.
+Since a given dialup port may switch between uucp logins and user logins,
+it is impossible to statically select the most efficient input mode to use.
+.PP
+We therefore changed the terminal multiplexor handlers
+to dynamically choose between the use of the silo and the use of
+per-character interrupts.
+At low input rates the handler processes characters on an
+interrupt basis, avoiding the overhead
+of checking each interface on each clock interrupt.
+During periods of sustained input, the handler enables the silo
+and starts a timer to drain input.
+This timer runs less frequently than the clock interrupts,
+and is used only when there is a substantial amount of input.
+The transition from using silos to an interrupt per character is
+damped to minimize the number of transitions with bursty traffic
+(such as in network communication).
+Input characters serve to flush the silo, preventing long latency.
+By switching between these two modes of operation dynamically,
+the overhead of checking the silos is incurred only
+when necessary.
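+.PP
+In outline, the mode switch works as in the following sketch;
+the names and thresholds are illustrative,
+and the damping described above is omitted:
+.DS
+rint(unit)		/* per-character receiver interrupt */
+{
+	if (++nchar[unit] > HIGHRATE) {	/* sustained input */
+		siloon(unit);		/* enable the hardware silo */
+		timeout(drain, (caddr_t)unit, hz/30);
+	}
+}
+
+drain(unit)		/* timer-driven while the silo is on */
+{
+	if (silodrain(unit) > 0)	/* busy; keep draining */
+		timeout(drain, (caddr_t)unit, hz/30);
+	else
+		silooff(unit);	/* idle; back to per-character mode */
+}
+.DE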
+.PP
+In addition to the savings in the terminal handlers,
+the clock interrupt routine is no longer required to schedule
+a software interrupt after each hardware interrupt to drain the silos.
+The software-interrupt level portion of the clock routine is only
+needed when timers expire or the current user process is collecting
+an execution profile.
+Thus, the number of interrupts attributable to clock processing
+is substantially reduced.
+.NH 3
+Process Table Management
+.PP
+As systems have grown larger, the size of the process table
+has grown far past 200 entries.
+With large tables, linear searches must be eliminated
+from any frequently used facility.
+The kernel process table is now multi-threaded to allow selective searching
+of active and zombie processes.
+A third list threads unused process table slots.
+Free slots can be obtained in constant time by taking one
+from the front of the free list.
+The number of processes used by a given user may be computed by scanning
+only the active list.
+Since the 4.2BSD release,
+the kernel has maintained linked lists of the descendants of each process.
+This linkage is now exploited when dealing with process exit;
+parents seeking the exit status of children now avoid a linear search
+of the process table, examining only their direct descendants.
+In addition, the previous algorithm for finding all descendants of an exiting
+process used multiple linear scans of the process table.
+This has been changed to follow the links between child processes and siblings.
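+.PP
+Allocation from the free list is a constant-time list removal,
+as in this sketch (names illustrative):
+.DS
+struct proc *freeproc;		/* head of the unused-slot list */
+
+struct proc *
+procalloc()
+{
+	register struct proc *p;
+
+	if ((p = freeproc) != (struct proc *)0)
+		freeproc = p->p_nxt;	/* pop the head of the list */
+	return (p);
+}
+.DE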
+.PP
+When forking a new process,
+the system must assign it a unique process identifier.
+The system previously scanned the entire process table each time it created
+a new process to locate an identifier that was not already in use.
+Now, to avoid scanning the process table for each new process,
+the system computes a range of unused identifiers
+that can be directly assigned.
+Only when the set of identifiers is exhausted is another process table
+scan required.
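+.PP
+A sketch of the technique (names illustrative):
+the system remembers a range of identifiers known to be unused
+and rescans only when that range is exhausted.
+.DS
+int nextpid;		/* next identifier to hand out */
+int pidchecked;		/* end of the range known to be free */
+
+int
+pidalloc()
+{
+
+	if (nextpid >= pidchecked)
+		/* one table scan recomputes both variables */
+		pidrescan(&nextpid, &pidchecked);
+	return (nextpid++);
+}
+.DE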
+.NH 3
+Scheduling
+.PP
+Previously the scheduler scanned the entire process table
+once per second to recompute process priorities.
+Processes that had run for their entire time slice had their
+priority lowered.
+Processes that had not used their time slice, or that had
+been sleeping for the past second had their priority raised.
+On systems running many processes,
+the scheduler represented nearly 20% of the system time.
+To reduce this overhead,
+the scheduler has been changed to consider only
+runnable processes when recomputing priorities.
+To insure that processes sleeping for more than a second
+still get their appropriate priority boost,
+their priority is recomputed when they are placed back on the run queue.
+Since the set of runnable processes is typically only a small fraction
+of the total number of processes on the system,
+the cost of invoking the scheduler drops proportionally.
+.NH 3
+Clock Handling
+.PP
+The hardware clock interrupts the processor 100 times per second
+at high priority.
+As most of the clock-based events need not be done at high priority,
+the system schedules a lower priority software interrupt to do the less
+time-critical events such as cpu scheduling and timeout processing.
+Often there are no such events, and the software interrupt handler
+finds nothing to do and returns.
+The high-priority clock interrupt now checks whether there are
+low-priority events to process;
+if there is nothing to do, no software interrupt is requested.
+Often, the high-priority interrupt occurs while the
+machine is running at low priority.
+Rather than posting a software interrupt that would occur as
+soon as it returns,
+the hardware clock interrupt handler simply lowers the processor priority
+and calls the software clock routines directly.
+Between these two optimizations, nearly 80 of the 100 software
+interrupts per second can be eliminated.
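+.PP
+In outline, the tail of the hardware clock handler now reads
+as follows (\fIsoftwork\fP is illustrative;
+the priority macros follow the conventional \fIspl\fP interface):
+.DS
+	if (softwork()) {	/* low-priority events pending? */
+		if (BASEPRI(ps)) {
+			/* interrupted low-priority code; */
+			/* lower priority and call directly */
+			(void) splsoftclock();
+			softclock();
+		} else
+			setsoftclock();	/* post software interrupt */
+	}
+	/* otherwise no software interrupt is requested */
+.DE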
+.NH 3
+File System
+.PP
+The file system uses a large block size, typically 4096 or 8192 bytes.
+To allow small files to be stored efficiently, the large blocks can
+be broken into smaller fragments, typically multiples of 1024 bytes.
+To minimize the number of full-sized blocks that must be broken
+into fragments, the file system uses a best fit strategy.
+Programs that slowly grow files using writes of 1024 bytes or less
+can force the file system to copy the data to
+successively larger and larger fragments until it finally
+grows to a full sized block.
+The file system still uses a best fit strategy the first time
+a fragment is written.
+However, the first time that the file system is forced to copy a growing
+fragment it places it at the beginning of a full sized block.
+Continued growth can be accommodated without further copying
+by using up the rest of the block.
+If the file ceases to grow, the rest of the block is still
+available for holding other fragments.
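+.PP
+In outline, the placement policy is now (illustrative pseudo-code;
+\fIosize\fP and \fInsize\fP are the old and new fragment sizes):
+.DS
+	if (osize > 0 && osize < nsize)
+		/* a fragment is growing: copy it once, to the
+		   start of a whole block, and grow in place */
+		bno = allocfullblock(fs);
+	else
+		/* first write of the fragment: best fit */
+		bno = bestfitfrag(fs, nsize);
+.DE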
+.PP
+When creating a new file name,
+the entire directory in which it will reside must be scanned
+to insure that the name does not already exist.
+For large directories, this scan is time consuming.
+Because there was no provision for shortening directories,
+a directory that was once over-filled would increase the cost
+of file creation even after the over-filling was corrected.
+Thus, for example, a congested uucp connection can leave a legacy long
+after it is cleared up.
+To alleviate the problem, the system now deletes empty blocks
+that it finds at the end of a directory while doing a complete
+scan to create a new name.
+.NH 3
+Network
+.PP
+The default amount of buffer space allocated for stream sockets (including
+pipes) has been increased to 4096 bytes.
+Stream sockets and pipes now return their buffer sizes in the block size field
+of the stat structure.
+This information allows the standard I/O library to use more optimal buffering.
+Unix domain stream sockets also return a dummy device and inode number
+in the stat structure to increase compatibility
+with other pipe implementations.
+The TCP maximum segment size is calculated according to the destination
+and interface in use; non-local connections use a more conservative size
+for long-haul networks.
+.PP
+On multiply-homed hosts, the local address bound by TCP now always corresponds
+to the interface that will be used in transmitting data packets for the
+connection.
+Several bugs in the calculation of round trip timing have been corrected.
+TCP now switches to an alternate gateway when an existing route fails,
+or when an ICMP redirect message is received.
+ICMP source quench messages are used to throttle the transmission
+rate of TCP streams by temporarily creating an artificially small
+send window, and retransmissions send only a single packet
+rather than resending all queued data.
+A send policy has been implemented
+that decreases the number of small packets outstanding
+for network terminal traffic [Nagle84],
+providing additional reduction of network congestion.
+The overhead of packet routing has been decreased by changes in the routing
+code and by cacheing the most recently used route for each datagram socket.
+.PP
+The buffer management strategy implemented by \fIsosend\fP has been
+changed to make better use of the increased size of the socket buffers,
+and the delayed acknowledgement algorithm has been tuned.
+Routing has been modified to include a one element cache of the last
+route computed.
+Multiple messages sent with the same destination now require less processing.
+Performance deteriorates under load on either the sending host,
+the receiving host, or the Ethernet.
+Also, any CPU contention substantially degrades
+the throughput achievable by user processes [Cabrera85].
+We have observed otherwise idle VAX 11/750s using up to 90% of their cycles
+transmitting network messages.
+.NH 3
+Exec
+.PP
+When \fIexec\fP-ing a new process, the kernel creates the new
+program's argument list by copying the arguments and environment
+from the parent process's address space into the system, then back out
+again onto the stack of the newly created process.
+These two copy operations were done one byte at a time, but
+are now done a string at a time.
+This optimization reduced the time to process
+an argument list by a factor of ten;
+the average time to do an \fIexec\fP call decreased by 25%.
+.NH 3
+Context Switching
+.PP
+The kernel used to post a software event when it wanted to force
+a process to be rescheduled.
+Often the process would be rescheduled for other reasons before
+exiting the kernel, delaying the event trap.
+At some later time the process would again
+be selected to run and would complete its pending system call,
+finally causing the event to take place.
+The event would cause the scheduler to be invoked a second time
+selecting the same process to run.
+The fix to this problem is to cancel any software reschedule
+events when saving a process context.
+This change doubles the speed with which processes
+can synchronize using pipes or signals.
+.NH 3
+Setjmp/Longjmp
+.PP
+The kernel routine \fIsetjmp\fP, which saves the current system
+context in preparation for a non-local goto, used to save many more
+registers than necessary under most circumstances.
+By trimming its operation to save only the minimum state required,
+the overhead for system calls decreased by an average of 13%.
+.NH 3
+Compensating for Lack of Compiler Technology
+.PP
+The current compilers available for C do not
+do any significant optimization.
+Good optimizing compilers are unlikely to be built;
+the C language is not well suited to optimization
+because of its rampant use of unbound pointers.
+Thus, many classical optimizations such as common subexpression
+analysis and selection of register variables must be done
+by hand using ``exterior'' knowledge of when such optimizations are safe.
+.PP
+Another optimization usually done by optimizing compilers
+is inline expansion of small or frequently used routines.
+In past Berkeley systems this has been done by using \fIsed\fP to
+run over the assembly language and replace calls to small
+routines with the code for the body of the routine, often
+a single VAX instruction.
+While this optimization eliminated the cost of the subroutine
+call and return,
+it did not eliminate the pushing and popping of several arguments
+to the routine.
+The \fIsed\fP script has been replaced by a more intelligent expander,
+\fIinline\fP, that merges the pushes and pops into moves to registers.
+For example, if the C code
+.DS
+if (scanc(map[i], 1, 47, i - 63))
+.DE
+is compiled into assembly language it generates the code shown
+in the left hand column of Table 11.
+The \fIsed\fP inline expander changes this code to that
+shown in the middle column.
+The newer optimizer eliminates most of the stack
+operations to generate the code shown in the right hand column.
+.KF
+.TS
+center, box;
+c s s s s s
+c s | c s | c s
+l l | l l | l l.
+Alternative C Language Code Optimizations
+_
+cc sed inline
+_
+subl3 $64,_i,\-(sp) subl3 $64,_i,\-(sp) subl3 $64,_i,r5
+pushl $47 pushl $47 movl $47,r4
+pushl $1 pushl $1 pushl $1
+mull2 $16,_i,r3 mull2 $16,_i,r3 mull2 $16,_i,r3
+pushl \-56(fp)[r3] pushl \-56(fp)[r3] movl \-56(fp)[r3],r2
+calls $4,_scanc movl (sp)+,r5 movl (sp)+,r3
+tstl r0 movl (sp)+,r4 scanc r2,(r3),(r4),r5
+jeql L7 movl (sp)+,r3 tstl r0
+ movl (sp)+,r2 jeql L7
+ scanc r2,(r3),(r4),r5
+ tstl r0
+ jeql L7
+.TE
+.ce
+Table 11. Alternative inline code expansions.
+.KE
+.PP
+Another optimization involved reevaluating
+existing data structures in the context of the current system.
+For example, disk buffer hashing was implemented when the system
+typically had thirty to fifty buffers.
+Most systems today have 200 to 1000 buffers.
+Consequently, most of the hash chains contained
+ten to a hundred buffers each!
+The running time of the low level buffer management primitives was
+dramatically improved simply by enlarging the size of the hash table.
+.NH 2
+Improvements to Libraries and Utilities
+.PP
+Intuitively, changes to the kernel would seem to have the greatest
+payoff since they affect all programs that run on the system.
+However, the kernel has been tuned many times before, so the
+opportunity for significant improvement was small.
+By contrast, many of the libraries and utilities had never been tuned.
+For example, we found utilities that spent 90% of their
+running time doing single-character read system calls.
+Changing the utility to use the standard I/O library cut the
+running time by a factor of five!
+Thus, while most of our time has been spent tuning the kernel,
+more than half of the speedups are because of improvements in
+other parts of the system.
+Some of the more dramatic changes are described in the following
+subsections.
+.NH 3
+Hashed Databases
+.PP
+UNIX provides a set of database management routines, \fIdbm\fP,
+that can be used to speed lookups in large data files
+with an external hashed index file.
+The original version of dbm was designed to work with only one
+database at a time. These routines were generalized to handle
+multiple database files, enabling them to be used in rewrites
+of the password and host file lookup routines. The new routines
+used to access the password file significantly improve the running
+time of many important programs such as the mail subsystem,
+the C-shell (in doing tilde expansion), \fIls \-l\fP, etc.
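+.PP
+A minimal usage sketch of the generalized interface:
+each open data base returns its own handle,
+so several may be used at once (file names illustrative).
+.DS
+#include <sys/file.h>
+#include <ndbm.h>
+
+	DBM *pw, *hosts;
+	datum key, val;
+
+	pw = dbm_open("/etc/passwd", O_RDONLY, 0);
+	hosts = dbm_open("/etc/hosts", O_RDONLY, 0);
+	key.dptr = name;
+	key.dsize = strlen(name);
+	val = dbm_fetch(pw, key);	/* hashed lookup; no linear scan */
+.DE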
+.NH 3
+Buffered I/O
+.PP
+The new filesystem with its larger block sizes allows better
+performance, but it is possible to degrade system performance
+by performing numerous small transfers rather than using
+appropriately-sized buffers.
+The standard I/O library
+automatically determines the optimal buffer size for each file.
+Some C library routines and commonly-used programs use low-level
+I/O or their own buffering, however.
+Several important utilities did not use the standard I/O library
+and buffered I/O using the old optimal buffer size of 1 kilobyte;
+these programs were changed to buffer I/O according to the
+optimal file system block size.
+These include the editor, the assembler, loader, remote file copy,
+the text formatting programs, and the C compiler.
+.PP
+The standard error output has traditionally been unbuffered
+to prevent delay in presenting the output to the user,
+and to prevent it from being lost if buffers are not flushed.
+The inordinate expense of sending single-byte packets through
+the network led us to impose a buffering scheme on the standard
+error stream.
+Within a single call to \fIfprintf\fP, all output is buffered temporarily.
+Before the call returns, all output is flushed and the stream is again
+marked unbuffered.
+As before, the normal block or line buffering mechanisms can be used
+instead of the default behavior.
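+.PP
+The effect is as if each call did the following
+(a sketch of the behavior, not the library source):
+.DS
+	char buf[BUFSIZ];
+
+	setbuffer(stderr, buf, sizeof buf);	/* buffer temporarily */
+	/* ... format the output into buf ... */
+	fflush(stderr);				/* one large write */
+	setbuf(stderr, (char *)0);		/* unbuffered again */
+.DE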
+.PP
+It is possible for well-intentioned programs to unwittingly
+defeat the standard I/O library's choice of I/O buffer size by using
+the \fIsetbuf\fP call to assign an output buffer.
+Because of portability requirements, the default buffer size provided
+by \fIsetbuf\fP is 1024 bytes; this can lead, once again, to added
+overhead.
+One such program with this problem was \fIcat\fP;
+there are undoubtedly other standard system utilities with similar problems
+as the system has changed much since they were originally written.
+.NH 3
+Mail System
+.PP
+The problems discussed in section 3.1.1 prompted significant work
+on the entire mail system. The first problem identified was a bug
+in the \fIsyslog\fP program. The mail delivery program, \fIsendmail\fP,
+logs all mail transactions through this process with the 4.2BSD interprocess
+communication facilities. \fISyslog\fP then records the information in
+a log file. Unfortunately, \fIsyslog\fP was performing a \fIsync\fP
+operation after each message it received, whether it was logged to a file
+or not. This wreaked havoc on the effectiveness of the
+buffer cache and explained, to a large
+extent, why sending mail to large distribution lists generated such a
+heavy load on the system (one syslog message was generated for each
+message recipient causing almost a continuous sequence of sync operations).
+.PP
+The hashed data base files were
+installed in all mail programs, resulting in an order of magnitude
+speedup on large distribution lists. The code in \fI/bin/mail\fP
+that notifies the \fIcomsat\fP program when mail has been delivered to
+a user was changed to cache host table lookups, resulting in a similar
+speedup on large distribution lists.
+.PP
+Next, the file locking facilities
+provided in 4.2BSD, \fIflock\fP\|(2), were used in place of the old
+locking mechanism.
+The mail system previously used \fIlink\fP and \fIunlink\fP in
+implementing file locking primitives.
+Because these operations usually modify the contents of directories
+they require synchronous disk operations and cannot take
+advantage of the name cache maintained by the system.
+Unlink requires that the entry be found in the directory so that
+it can be removed;
+link requires that the directory be scanned to insure that the name
+does not already exist.
+By contrast the advisory locking facility in 4.2BSD is
+efficient because it is all done with in-memory tables.
+Thus, the mail system was modified to use the file locking primitives.
+This yielded another 10% cut in the basic overhead of delivering mail.
+Extensive profiling and tuning of \fIsendmail\fP and
+compiling it without debugging code reduced the overhead by another 20%.
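+.PP
+The locking idiom now used is simply (mailbox path illustrative):
+.DS
+#include <sys/file.h>
+
+	int fd = open("/usr/spool/mail/user", O_RDWR, 0);
+
+	(void) flock(fd, LOCK_EX);	/* in-memory; no directory I/O */
+	/* ... append the message ... */
+	(void) flock(fd, LOCK_UN);
+	(void) close(fd);
+.DE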
+.NH 3
+Network Servers
+.PP
+With the introduction of the network facilities in 4.2BSD,
+a myriad of services became available, each of which
+required its own daemon process.
+Many of these daemons were rarely if ever used,
+yet they lay asleep in the process table consuming
+system resources and generally slowing down response.
+Rather than having many servers started at boot time, a single server,
+\fIinetd\fP, was substituted.
+This process reads a simple configuration file
+that specifies the services the system is willing to support
+and listens for service requests on each service's Internet port.
+When a client requests service the appropriate server is created
+and passed a service connection as its standard input. Servers
+that require the identity of their client may use the \fIgetpeername\fP
+system call; likewise \fIgetsockname\fP may be used to find out
+a server's local address without consulting data base files.
+This scheme is attractive for several reasons:
+.IP \(bu 3
+it eliminates
+as many as a dozen processes, easing system overhead and
+allowing the file and text tables to be made smaller,
+.IP \(bu 3
+servers need not contain the code required to handle connection
+queueing, simplifying the programs, and
+.IP \(bu 3
+installing and replacing servers becomes simpler.
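+.PP
+The heart of such a server is small; in outline
+(error handling, configuration parsing, and the \fIselect\fP loop
+over the many service ports are omitted from this sketch):
+.DS
+	for (;;) {
+		fd = accept(sock, (struct sockaddr *)0, (int *)0);
+		if (fork() == 0) {	/* child becomes the server */
+			dup2(fd, 0);	/* connection is standard input */
+			dup2(fd, 1);	/* ... and standard output */
+			execv(server, srvargs);
+			_exit(1);	/* exec failed */
+		}
+		close(fd);		/* parent resumes listening */
+	}
+.DE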
+.PP
+With an increased number of networks, both local and external to Berkeley,
+we found that the overhead of the routing process was becoming
+inordinately high.
+Several changes were made in the routing daemon to reduce this load.
+Routes to external networks are no longer exchanged by routers
+on the internal machines, only a route to a default gateway.
+This reduces the amount of network traffic and the time required
+to process routing messages.
+In addition, the routing daemon was profiled
+and functions responsible for large amounts
+of time were optimized.
+The major changes were a faster hashing scheme,
+and inline expansions of the ubiquitous byte-swapping functions.
+.PP
+Under certain circumstances, when output was blocked,
+attempts by the remote login process
+to send output to the user were rejected by the system,
+although a prior \fIselect\fP call had indicated that data could be sent.
+This resulted in continuous attempts to write the data until the remote
+user restarted output.
+This problem was initially avoided in the remote login handler,
+and the original problem in the kernel has since been corrected.
+.NH 3
+The C Run-time Library
+.PP
+Several people have found poorly tuned code
+in frequently used routines in the C library [Lankford84].
+In particular the running time of the string routines can be
+cut in half by rewriting them using the VAX string instructions.
+The memory allocation routines have been tuned to waste less
+memory for memory allocations with sizes that are a power of two.
+Certain library routines that did file input in one-character reads
+have been corrected.
+Other library routines including \fIfread\fP and \fIfwrite\fP
+have been rewritten for efficiency.
+.NH 3
+Csh
+.PP
+The C-shell was converted to run on 4.2BSD by
+writing a set of routines to simulate the old jobs library.
+While this provided a functioning C-shell,
+it was grossly inefficient, generating up
+to twenty system calls per prompt.
+The C-shell has been modified to use the new signal
+facilities directly,
+cutting the number of system calls per prompt in half.
+Additional tuning was done with the help of profiling
+to cut the cost of frequently used facilities.
diff --git a/share/doc/papers/sysperf/5.t b/share/doc/papers/sysperf/5.t
new file mode 100644
index 000000000000..5d70a9a67368
--- /dev/null
+++ b/share/doc/papers/sysperf/5.t
@@ -0,0 +1,285 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)5.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Functional Extensions
+.NH
+Functional Extensions
+.PP
+Some of the facilities introduced in 4.2BSD were not completely
+implemented. An important part of the effort that went into
+4.3BSD was to clean up and unify both new and old facilities.
+.NH 2
+Kernel Extensions
+.PP
+A significant effort went into improving
+the networking part of the kernel.
+The work consisted of fixing bugs,
+tuning the algorithms,
+and revamping the lowest levels of the system
+to better handle heterogeneous network topologies.
+.NH 3
+Subnets, Broadcasts and Gateways
+.PP
+To allow sites to expand their network in an autonomous
+and orderly fashion, subnetworks have been introduced in 4.3BSD [GADS85].
+This facility allows sites to subdivide their local Internet address
+space into multiple subnetwork address spaces that are visible
+only by hosts at that site. To off-site hosts, machines on a site's
+subnetworks appear to reside on a single network. The routing daemon
+has been reworked to provide routing support in this type of
+environment.
+.PP
+The default Internet broadcast address is now specified with a host part
+of all ones, rather than all zeros.
+The broadcast address may be set at boot time on a per-interface basis.
+.NH 3
+Interface Addressing
+.PP
+The organization of network interfaces has been
+reworked to more cleanly support multiple
+network protocols. Network interfaces no longer
+contain a host's address on that network; instead
+each interface contains a pointer to a list of addresses
+assigned to that interface. This permits a single
+interface to support, for example, Internet protocols
+at the same time as XNS protocols.
+.PP
+The Address Resolution Protocol (ARP) support
+for 10 megabyte/second Ethernet\(dg
+.FS
+\(dg Ethernet is a trademark of Xerox.
+.FE
+has been made more flexible by allowing hosts to
+act as a ``clearing house'' for hosts that do
+not support ARP. In addition, system managers have
+more control over the contents of the ARP translation
+cache and may interactively interrogate and modify
+the cache's contents.
+.NH 3
+User Control of Network Buffering
+.PP
+Although the system allocates reasonable default amounts of buffering
+for most connections, certain operations such as file system dumps
+to remote machines benefit from significant increases in buffering [Walsh84].
+The \fIsetsockopt\fP system call has been extended to allow such requests.
+In addition, \fIgetsockopt\fP and \fIsetsockopt\fP
+are now interfaced to the protocol level, allowing protocol-specific
+options to be manipulated by the user.
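+.PP
+For example, a remote dump might request larger buffers as follows
+(the size is illustrative):
+.DS
+#include <sys/socket.h>
+
+	int size = 32 * 1024;
+
+	(void) setsockopt(s, SOL_SOCKET, SO_SNDBUF,
+	    (char *)&size, sizeof (size));
+.DE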
+.NH 3
+Number of File Descriptors
+.PP
+To allow full use of the many descriptor based services available,
+the previous hard limit of 30 open files per process has been relaxed.
+The changes entailed generalizing \fIselect\fP to handle arrays of
+32-bit words, removing the dependency on file descriptors from
+the page table entries,
+and limiting most of the linear scans of a process's file table.
+The default per-process descriptor limit was raised from 20 to 64,
+though there are no longer any hard upper limits on the number
+of file descriptors.
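+.PP
+Descriptor sets are now declared and manipulated with macros,
+as in the following sketch, removing the old one-word limit:
+.DS
+#include <sys/types.h>
+#include <sys/time.h>
+
+	fd_set rd;
+
+	FD_ZERO(&rd);
+	FD_SET(fd, &rd);	/* works for descriptors beyond 31 */
+	if (select(fd + 1, &rd, (fd_set *)0, (fd_set *)0,
+	    (struct timeval *)0) > 0 && FD_ISSET(fd, &rd))
+		;		/* fd is ready for reading */
+.DE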
+.NH 3
+Kernel Limits
+.PP
+Many internal kernel configuration limits have been increased by suitable
+modifications to data structures.
+The limit on physical memory has been raised from 8 megabytes to 64 megabytes,
+and the limit of 15 mounted file systems has been raised to 255.
+The maximum file system size has been increased to 8 gigabytes,
+the number of processes to 65536,
+and the per-process size to 64 megabytes of data and 64 megabytes of stack.
+Note that these are upper bounds;
+the default limits for these quantities are tuned for systems
+with 4-8 megabytes of physical memory.
+.NH 3
+Memory Management
+.PP
+The global clock page replacement algorithm used to have a single
+hand that was used both to mark and to reclaim memory.
+The first time that it encountered a page it would clear its reference bit.
+If the reference bit was still clear on its next pass across the page,
+it would reclaim the page.
+The use of a single hand does not work well with large physical
+memories, as a single revolution of the hand
+can take a minute or more.
+By the time the hand gets around to the marked pages,
+the information is usually no longer pertinent.
+During periods of sudden shortages,
+the page daemon will not be able to find any reclaimable pages until
+it has completed a full revolution.
+To alleviate this problem,
+the clock hand has been split into two separate hands.
+The front hand clears the reference bits;
+the back hand follows a constant number of pages behind,
+reclaiming pages whose reference bits are still clear.
+While the code has been written to allow the distance between
+the hands to be varied, we have not found any algorithms
+suitable for determining how to dynamically adjust this distance.
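+.PP
+A sketch of one scan of the two-handed loop (names illustrative;
+the hand separation is the constant distance described above):
+.DS
+	for (count = 0; count < scanrate; count++) {
+		clearref(front);	/* front hand marks */
+		if (!testref(back))	/* still unreferenced? */
+			reclaim(back);	/* back hand reclaims */
+		front = nextpage(front);	/* both hands advance, */
+		back = nextpage(back);		/* modulo memory size */
+	}
+.DE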
+.PP
+The configuration of the virtual memory system used to require
+a significant understanding of its operation to do such
+simple tasks as increasing the maximum process size.
+This process has been significantly improved so that the most
+common configuration parameters, such as the virtual memory sizes,
+can be specified using a single option in the configuration file.
+Standard configurations support data and stack segments
+of 17, 33 and 64 megabytes.
+.NH 3
+Signals
+.PP
+The 4.2BSD signal implementation would push several words
+onto the normal run-time stack before switching to an
+alternate signal stack.
+The 4.3BSD implementation has been corrected so that
+the entire signal handler's state is now pushed onto the signal stack.
+Another limitation in the original signal implementation was
+that it used an undocumented system call to return from signals.
+Users could not write their own return from exceptions;
+4.3BSD formally specifies the \fIsigreturn\fP system call.
+.PP
+Many existing programs depend on interrupted system calls.
+The restartable system call semantics of 4.2BSD signals caused
+many of these programs to break.
+To simplify porting of programs from inferior versions of
+.UX
+the \fIsigvec\fP system call has been extended so that
+programmers may specify that system calls are not to be
+restarted after particular signals.
+.NH 3
+System Logging
+.PP
+A system logging facility has been added
+that sends kernel messages to the
+syslog daemon for logging in /usr/adm/messages and possibly for
+printing on the system console.
+The revised scheme for logging messages
+eliminates the time lag in updating the messages file,
+unifies the format of kernel messages,
+provides a finer granularity of control over the messages
+that get printed on the console,
+and eliminates the degradation in response during the printing of
+low-priority kernel messages.
+Recoverable system errors and common resource limitations are logged
+using this facility.
+Most system utilities, such as init and login,
+have been modified to log errors to syslog
+rather than writing directly on the console.
+.NH 3
+Windows
+.PP
+The tty structure has been augmented to hold
+information about the size
+of an associated window or terminal.
+These sizes can be obtained by programs such as editors that want
+to know the size of the screen they are manipulating.
+When these sizes are changed,
+a new signal, SIGWINCH, is sent to the current process group.
+The editors have been modified to catch this signal and reshape
+their view of the world, and the remote login program and server
+now cooperate to propagate window sizes and window size changes
+across a network.
+Other programs and libraries such as curses that need the width
+or height of the screen have been modified to use this facility as well.
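+.PP
+A program tracks its window size with an \fIioctl\fP and the new signal:
+.DS
+#include <sys/ioctl.h>
+#include <signal.h>
+
+struct winsize ws;
+
+winch()				/* SIGWINCH handler */
+{
+
+	(void) ioctl(0, TIOCGWINSZ, (char *)&ws);
+	/* ws.ws_row and ws.ws_col hold the new screen size */
+}
+
+/* in initialization: signal(SIGWINCH, winch); winch(); */
+.DE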
+.NH 3
+Configuration of UNIBUS Devices
+.PP
+The UNIBUS configuration routines have been extended to allow auto-configuration
+of dedicated UNIBUS memory held by devices.
+The new routines simplify the configuration of memory-mapped devices
+and correct problems occurring on reset of the UNIBUS.
+.NH 3
+Disk Recovery from Errors
+.PP
+The MASSBUS disk driver's error recovery routines have been fixed to
+retry before correcting ECC errors, support ECC on bad-sector replacements,
+and correctly attempt retries after earlier
+corrective actions in the same transfer.
+The error messages are more accurate.
+.NH 2
+Functional Extensions to Libraries and Utilities
+.PP
+Most of the changes to the utilities and libraries have been to
+allow them to handle a more general set of problems,
+or to handle the same set of problems more quickly.
+.NH 3
+Name Server
+.PP
+In 4.2BSD the name resolution routines (\fIgethostbyname\fP,
+\fIgetservbyname\fP,
+etc.) were implemented by a set of database files maintained on the
+local machine.
+Inconsistencies or obsolescence in these files resulted in inaccessibility of
+hosts or services.
+In 4.3BSD these files may be replaced by a network name server that can
+insure a consistent view of the name space in a multimachine environment.
+This name server operates in accordance with Internet standards
+for service on the ARPANET [Mockapetris83].
+.NH 3
+System Management
+.PP
+A new utility, \fIrdist\fP,
+has been provided to assist system managers in keeping
+all their machines up to date with a consistent set of sources and binaries.
+A master set of sources may reside on a single central machine,
+or be distributed at (known) locations throughout the environment.
+New versions of \fIgetty\fP, \fIinit\fP, and \fIlogin\fP
+merge the functions of several
+files into a single place, and allow more flexibility in the
+startup of processes such as window managers.
+.PP
+The new utility \fItimed\fP keeps the time on a group of cooperating machines
+(within a single LAN) synchronized to within 30 milliseconds.
+It does its corrections using a new system call that changes
+the rate of time advance without stopping or reversing the system clock.
+It normally selects one machine to act as a master.
+If the master dies or is partitioned, a new master is elected.
+Other machines may participate in a purely slave role.
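+.PP
+The corrections use the new \fIadjtime\fP call,
+which skews the clock rate until the requested delta has been absorbed;
+for example:
+.DS
+#include <sys/time.h>
+
+	struct timeval delta, olddelta;
+
+	delta.tv_sec = 0;
+	delta.tv_usec = -20000;	/* absorb 20 ms, gradually */
+	(void) adjtime(&delta, &olddelta);
+.DE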
+.NH 3
+Routing
+.PP
+Many bugs in the routing daemon have been fixed;
+it is considerably more robust,
+and now understands how to properly deal with
+subnets and point-to-point networks.
+Its operation has been made more efficient by tuning with the use
+of execution profiles, along with inline expansion of common operations
+using the kernel's \fIinline\fP optimizer.
+.NH 3
+Compilers
+.PP
+The symbolic debugger \fIdbx\fP has had many new features added,
+and all the known bugs fixed. In addition \fIdbx\fP
+has been extended to work with the Pascal compiler.
+The fortran compiler \fIf77\fP has had numerous bugs fixed.
+The C compiler has been modified so that it can, optionally,
+generate single precision floating point instructions when operating
+on single precision variables.
diff --git a/share/doc/papers/sysperf/6.t b/share/doc/papers/sysperf/6.t
new file mode 100644
index 000000000000..a445ee19ff5a
--- /dev/null
+++ b/share/doc/papers/sysperf/6.t
@@ -0,0 +1,70 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)6.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Security Tightening
+.NH
+Security Tightening
+.PP
+Since we do not wish to encourage rampant system cracking,
+we describe only briefly the changes made to enhance security.
+.NH 2
+Generic Kernel
+.PP
+Several loopholes in the process tracing facility have been corrected.
+Programs being traced may not be executed;
+executing programs may not be traced.
+Programs may not provide input to terminals to which they do not
+have read permission.
+The handling of process groups has been tightened to eliminate
+some problems.
+When a program attempts to change its process group,
+the system checks to see if the process with the pid of the process
+group was started by the same user.
+If it exists and was started by a different user, the process group
+number change is denied.
+.NH 2
+Security Problems in Utilities
+.PP
+Setuid utilities no longer use the \fIpopen\fP or \fIsystem\fP library routines.
+Access to the kernel's data structures through the kmem device
+is now restricted to programs that are set group id ``kmem''.
+Thus many programs that used to run with root privileges
+no longer need to do so.
+Access to disk devices is now controlled by an ``operator'' group id;
+this permission allows operators to function without being the super-user.
+Only users in group wheel can do ``su root''; this restriction
+allows administrators to define a super-user access list.
+Numerous holes have been closed in the shell to prevent
+users from gaining privileges from set user id shell scripts,
+although use of such scripts is still highly discouraged on systems
+that are concerned about security.
diff --git a/share/doc/papers/sysperf/7.t b/share/doc/papers/sysperf/7.t
new file mode 100644
index 000000000000..68f5717f5c09
--- /dev/null
+++ b/share/doc/papers/sysperf/7.t
@@ -0,0 +1,164 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)7.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Conclusions
+.NH
+Conclusions
+.PP
+4.2BSD, while functionally superior to 4.1BSD, lacked much of the
+performance tuning required of a good system. We found that
+the distributed system spent 10-20% more time in the kernel than
+4.1BSD. This added overhead combined with problems with several
+user programs severely limited the overall performance of the
+system in a general timesharing environment.
+.PP
+Changes made to the system since the 4.2BSD distribution have
+eliminated most of the
+added system overhead by replacing old algorithms
+or introducing additional cacheing schemes.
+The combined caches added to the name translation process
+reduce the average cost of translating a pathname to an inode by more than 50%.
+These changes reduce the percentage of time spent running
+in the system by nearly 9%.
+.PP
+The use of silo input on terminal ports only when necessary
+has allowed the system to avoid a large amount of software interrupt
+processing. Observations show that the system is forced to
+field about 25% fewer interrupts than before.
+.PP
+The kernel
+changes, combined with many bug fixes, make the system much more
+responsive in a general timesharing environment.
+The 4.3BSD Berkeley UNIX system now appears
+capable of supporting loads at least as large as those supported under
+4.1BSD while providing all the new interprocess communication, networking,
+and file system facilities.
+.nr H2 1
+.ds RH Acknowledgements
+.SH
+\s+2Acknowledgements\s0
+.PP
+We would like to thank Robert Elz for sharing his ideas and
+his code for cacheing system wide names and searching the process table.
+We thank Alan Smith for initially suggesting the use of a
+capability-based cache.
+We also acknowledge
+George Goble who dropped many of our changes
+into his production system and reported back fixes to the
+disasters that they caused.
+The buffer cache read-ahead trace package was based
+on a program written by Jim Lawson. Ralph Campbell
+implemented several of the C library changes. The original
+version of the Internet daemon was written by Bill Joy.
+In addition,
+we would like to thank the many other people that contributed
+ideas, information, and work while the system was undergoing change.
+.ds RH References
+.nr H2 1
+.sp 2
+.SH
+\s+2References\s-2
+.LP
+.IP [Cabrera84] 20
+Luis Felipe Cabrera, Eduard Hunter, Michael J. Karels, and David Mosher,
+``A User-Process Oriented Performance Study of Ethernet Networking Under
+Berkeley UNIX 4.2BSD,''
+Research Report No. UCB/CSD 84/217, University of California,
+Berkeley, December 1984.
+.IP [Cabrera85] 20
+Luis Felipe Cabrera, Michael J. Karels, and David Mosher,
+``The Impact of Buffer Management on Networking Software Performance
+in Berkeley UNIX 4.2BSD: A Case Study,''
+Proceedings of the Summer Usenix Conference, Portland, Oregon,
+June 1985, pp. 507-517.
+.IP [GADS85] 20
+GADS (Gateway Algorithms and Data Structures Task Force),
+``Toward an Internet Standard for Subnetting,'' RFC-940,
+Network Information Center, SRI International,
+April 1985.
+.IP [Joy80] 20
+William Joy,
+``Comments on the Performance of UNIX on the VAX,''
+Computer System Research Group, U.C. Berkeley,
+April 1980.
+.IP [Kashtan80] 20
+David L. Kashtan,
+``UNIX and VMS, Some Performance Comparisons,''
+SRI International, February 1980.
+.IP [Lankford84] 20
+Jeffrey Lankford,
+``UNIX System V and 4BSD Performance,''
+\fIProceedings of the Salt Lake City Usenix Conference\fP,
+pp 228-236, June 1984.
+.IP [Leffler84] 20
+Sam Leffler, Mike Karels, and M. Kirk McKusick,
+``Measuring and Improving the Performance of 4.2BSD,''
+\fIProceedings of the Salt Lake City Usenix Conference\fP,
+pp 237-252, June 1984.
+.IP [McKusick85] 20
+M. Kirk McKusick, Mike Karels, and Samuel Leffler,
+``Performance Improvements and Functional Enhancements in 4.3BSD,''
+\fIProceedings of the Portland Usenix Conference\fP,
+pp 519-531, June 1985.
+.IP [Mockapetris83] 20
+Paul Mockapetris, ``Domain Names \- Implementation and Schedule,''
+Network Information Center, SRI International,
+RFC-883,
+November 1983.
+.IP [Mogul84] 20
+Jeffrey Mogul, ``Broadcasting Internet Datagrams,'' RFC-919,
+Network Information Center, SRI International,
+October 1984.
+.IP [Mosher80] 20
+David Mosher,
+``UNIX Performance, an Introspection,''
+Presented at the Boulder, Colorado Usenix Conference, January 1980.
+Copies of the paper are available from
+Computer System Research Group, U.C. Berkeley.
+.IP [Nagle84] 20
+John Nagle, ``Congestion Control in IP/TCP Internetworks,'' RFC-896,
+Network Information Center, SRI International,
+January 1984.
+.IP [Ritchie74] 20
+D. M. Ritchie and K. Thompson,
+``The UNIX Time-Sharing System,''
+CACM 17, 7, July 1974, pp. 365-375.
+.IP [Shannon83] 20
+W. Shannon,
+private communication,
+July 1983.
+.IP [Walsh84] 20
+Robert Walsh and Robert Gurwitz,
+``Converting BBN TCP/IP to 4.2BSD,''
+\fIProceedings of the Salt Lake City Usenix Conference\fP,
+pp 52-61, June 1984.
diff --git a/share/doc/papers/sysperf/Makefile b/share/doc/papers/sysperf/Makefile
new file mode 100644
index 000000000000..b65852bff4f4
--- /dev/null
+++ b/share/doc/papers/sysperf/Makefile
@@ -0,0 +1,22 @@
+# @(#)Makefile 1.6 (Berkeley) 6/8/93
+
+DIR= papers/sysperf
+MACROS= -ms
+SRCS= 0.t 1.t 2.t 3.t 4.t 5.t 6.t 7.t
+EXTRA= a1.t a2.t
+OBJS= paper.tmp appendix.tmp
+CLEANFILES+=${OBJS}
+
+paper.ps: ${OBJS}
+ ${ROFF} ${OBJS} > ${.TARGET}
+
+paper.tmp: ${SRCS}
+ ${TBL} ${SRCS} | ${EQN} > paper.tmp
+
+appendix.tmp: a1.t a2.t
+ ${GRIND} -f a1.t | awk '/\.\(\)/{ cnt = 2 } \
+ { if (cnt) cnt -= 1; else print $$0; } ' > appendix.tmp
+ ${GRIND} -f -lcsh a2.t | awk '/\.\(\)/{ cnt = 2 } \
+ { if (cnt) cnt -= 1; else print $$0; } ' >> appendix.tmp
+
+.include <bsd.doc.mk>
diff --git a/share/doc/papers/sysperf/a1.t b/share/doc/papers/sysperf/a1.t
new file mode 100644
index 000000000000..b94f6aa1e2c9
--- /dev/null
+++ b/share/doc/papers/sysperf/a1.t
@@ -0,0 +1,668 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)a1.t 5.1 (Berkeley) 4/17/91
+.\"
+.ds RH Appendix A \- Benchmark sources
+.nr H2 1
+.sp 2
+.de vS
+.nf
+..
+.de vE
+.fi
+..
+.bp
+.SH
+\s+2Appendix A \- Benchmark sources\s-2
+.LP
+The programs shown here run under 4.2 with only routines
+from the standard libraries. When run under 4.1 they were augmented
+with a \fIgetpagesize\fP routine and a copy of the \fIrandom\fP
+function from the C library. The \fIvforks\fP and \fIvexecs\fP
+programs are constructed from the \fIforks\fP and \fIexecs\fP programs,
+respectively, by substituting calls to \fIfork\fP with calls to
+\fIvfork\fP.
+.SH
+syscall
+.LP
+.vS
+/*
+ * System call overhead benchmark.
+ */
+main(argc, argv)
+ char *argv[];
+{
+ register int ncalls;
+
+ if (argc < 2) {
+ printf("usage: %s #syscalls\n", argv[0]);
+ exit(1);
+ }
+ ncalls = atoi(argv[1]);
+ while (ncalls-- > 0)
+ (void) getpid();
+}
+.vE
+.SH
+csw
+.LP
+.vS
+/*
+ * Context switching benchmark.
+ *
+ * Force system to context switch 2*nsigs
+ * times by forking and exchanging signals.
+ * To calculate system overhead for a context
+ * switch, the signocsw program must be run
+ * with nsigs. Overhead is then estimated by
+ * t1 = time csw <n>
+ * t2 = time signocsw <n>
+ * overhead = t1 - 2 * t2;
+ */
+#include <signal.h>
+
+int sigsub();
+int otherpid;
+int nsigs;
+
+main(argc, argv)
+ char *argv[];
+{
+ int pid;
+
+ if (argc < 2) {
+ printf("usage: %s nsignals\n", argv[0]);
+ exit(1);
+ }
+ nsigs = atoi(argv[1]);
+ signal(SIGALRM, sigsub);
+ otherpid = getpid();
+ pid = fork();
+ if (pid != 0) {
+ otherpid = pid;
+ kill(otherpid, SIGALRM);
+ }
+ for (;;)
+ sigpause(0);
+}
+
+sigsub()
+{
+
+ signal(SIGALRM, sigsub);
+ kill(otherpid, SIGALRM);
+ if (--nsigs <= 0)
+ exit(0);
+}
+.vE
+.SH
+signocsw
+.LP
+.vS
+/*
+ * Signal without context switch benchmark.
+ */
+#include <signal.h>
+
+int pid;
+int nsigs;
+int sigsub();
+
+main(argc, argv)
+ char *argv[];
+{
+ register int i;
+
+ if (argc < 2) {
+ printf("usage: %s nsignals\n", argv[0]);
+ exit(1);
+ }
+ nsigs = atoi(argv[1]);
+ signal(SIGALRM, sigsub);
+ pid = getpid();
+ for (i = 0; i < nsigs; i++)
+ kill(pid, SIGALRM);
+}
+
+sigsub()
+{
+
+ signal(SIGALRM, sigsub);
+}
+.vE
+.SH
+pipeself
+.LP
+.vS
+/*
+ * IPC benchmark,
+ * write to self using pipes.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ char buf[512];
+ int fd[2], msgsize;
+ register int i, iter;
+
+ if (argc < 3) {
+ printf("usage: %s iterations message-size\n", argv[0]);
+ exit(1);
+ }
+ argc--, argv++;
+ iter = atoi(*argv);
+ argc--, argv++;
+ msgsize = atoi(*argv);
+ if (msgsize > sizeof (buf) || msgsize <= 0) {
+ printf("%s: Bad message size.\n", *argv);
+ exit(2);
+ }
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ for (i = 0; i < iter; i++) {
+ write(fd[1], buf, msgsize);
+ read(fd[0], buf, msgsize);
+ }
+}
+.vE
+.SH
+pipediscard
+.LP
+.vS
+/*
+ * IPC benchmark,
+ * write and discard using pipes.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ char buf[512];
+ int fd[2], msgsize;
+ register int i, iter;
+
+ if (argc < 3) {
+ printf("usage: %s iterations message-size\n", argv[0]);
+ exit(1);
+ }
+ argc--, argv++;
+ iter = atoi(*argv);
+ argc--, argv++;
+ msgsize = atoi(*argv);
+ if (msgsize > sizeof (buf) || msgsize <= 0) {
+ printf("%s: Bad message size.\n", *argv);
+ exit(2);
+ }
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ if (fork() == 0)
+ for (i = 0; i < iter; i++)
+ read(fd[0], buf, msgsize);
+ else
+ for (i = 0; i < iter; i++)
+ write(fd[1], buf, msgsize);
+}
+.vE
+.SH
+pipeback
+.LP
+.vS
+/*
+ * IPC benchmark,
+ * read and reply using pipes.
+ *
+ * Process forks and exchanges messages
+ * over a pipe in a request-response fashion.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ char buf[512];
+ int fd[2], fd2[2], msgsize;
+ register int i, iter;
+
+ if (argc < 3) {
+ printf("usage: %s iterations message-size\n", argv[0]);
+ exit(1);
+ }
+ argc--, argv++;
+ iter = atoi(*argv);
+ argc--, argv++;
+ msgsize = atoi(*argv);
+ if (msgsize > sizeof (buf) || msgsize <= 0) {
+ printf("%s: Bad message size.\n", *argv);
+ exit(2);
+ }
+ if (pipe(fd) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ if (pipe(fd2) < 0) {
+ perror("pipe");
+ exit(3);
+ }
+ if (fork() == 0)
+ for (i = 0; i < iter; i++) {
+ read(fd[0], buf, msgsize);
+ write(fd2[1], buf, msgsize);
+ }
+ else
+ for (i = 0; i < iter; i++) {
+ write(fd[1], buf, msgsize);
+ read(fd2[0], buf, msgsize);
+ }
+}
+.vE
+.SH
+forks
+.LP
+.vS
+/*
+ * Benchmark program to calculate fork+wait
+ * overhead (approximately). Process
+ * forks and exits while parent waits.
+ * The time to run this program is used
+ * in calculating exec overhead.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ register int nforks, i;
+ char *cp;
+ int pid, child, status, brksize;
+
+ if (argc < 2) {
+ printf("usage: %s number-of-forks sbrk-size\n", argv[0]);
+ exit(1);
+ }
+ nforks = atoi(argv[1]);
+ if (nforks < 0) {
+ printf("%s: bad number of forks\n", argv[1]);
+ exit(2);
+ }
+ brksize = atoi(argv[2]);
+ if (brksize < 0) {
+ printf("%s: bad size to sbrk\n", argv[2]);
+ exit(3);
+ }
+ cp = (char *)sbrk(brksize);
+ if ((int)cp == -1) {
+ perror("sbrk");
+ exit(4);
+ }
+ for (i = 0; i < brksize; i += 1024)
+ cp[i] = i;
+ while (nforks-- > 0) {
+ child = fork();
+ if (child == -1) {
+ perror("fork");
+ exit(-1);
+ }
+ if (child == 0)
+ _exit(-1);
+ while ((pid = wait(&status)) != -1 && pid != child)
+ ;
+ }
+ exit(0);
+}
+.vE
+.SH
+execs
+.LP
+.vS
+/*
+ * Benchmark program to calculate exec
+ * overhead (approximately). Process
+ * forks and execs "null" test program.
+ * The time to run the fork program should
+ * then be deducted from this one to
+ * estimate the overhead for the exec.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+ register int nexecs, i;
+ char *cp, *sbrk();
+ int pid, child, status, brksize;
+
+ if (argc < 3) {
+ printf("usage: %s number-of-execs sbrk-size job-name\n",
+ argv[0]);
+ exit(1);
+ }
+ nexecs = atoi(argv[1]);
+ if (nexecs < 0) {
+ printf("%s: bad number of execs\n", argv[1]);
+ exit(2);
+ }
+ brksize = atoi(argv[2]);
+ if (brksize < 0) {
+ printf("%s: bad size to sbrk\n", argv[2]);
+ exit(3);
+ }
+ cp = sbrk(brksize);
+ if ((int)cp == -1) {
+ perror("sbrk");
+ exit(4);
+ }
+ for (i = 0; i < brksize; i += 1024)
+ cp[i] = i;
+ while (nexecs-- > 0) {
+ child = fork();
+ if (child == -1) {
+ perror("fork");
+ exit(-1);
+ }
+ if (child == 0) {
+ execv(argv[3], argv);
+ perror("execv");
+ _exit(-1);
+ }
+ while ((pid = wait(&status)) != -1 && pid != child)
+ ;
+ }
+ exit(0);
+}
+.vE
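+.LP
+The subtraction described in the comment can be made concrete: with
+matching iteration counts and sbrk sizes the fork cost cancels,
+leaving the exec cost.  An illustration with placeholder timings,
+not results from the paper:
+.vS
+/*
+ * exec overhead ~= time(execs n size job) - time(forks n size);
+ * both timings below are hypothetical.
+ */
+#include <stdio.h>
+
+int
+main(void)
+{
+	int n = 1000;
+	double t_execs = 5.0;	/* hypothetical: execs 1000 0 nulljob */
+	double t_forks = 2.0;	/* hypothetical: forks 1000 0 */
+
+	printf("exec: %.2f msec each\n",
+	    1e3 * (t_execs - t_forks) / n);
+	return (0);
+}
+.vE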
+.SH
+nulljob
+.LP
+.vS
+/*
+ * Benchmark "null job" program.
+ */
+
+main(argc, argv)
+ char *argv[];
+{
+
+ exit(0);
+}
+.vE
+.SH
+bigjob
+.LP
+.vS
+/*
+ * Benchmark "null big job" program.
+ */
+/* 250 here is intended to approximate vi's text+data size */
+char space[1024 * 250] = "force into data segment";
+
+main(argc, argv)
+ char *argv[];
+{
+
+ exit(0);
+}
+.vE
+.bp
+.SH
+seqpage
+.LP
+.vS
+/*
+ * Sequential page access benchmark.
+ */
+#include <sys/vadvise.h>
+
+char *valloc();
+
+main(argc, argv)
+ char *argv[];
+{
+	register int i, niter;
+ register char *pf, *lastpage;
+ int npages = 4096, pagesize, vflag = 0;
+ char *pages, *name;
+
+ name = argv[0];
+ argc--, argv++;
+again:
+ if (argc < 1) {
+usage:
+ printf("usage: %s [ -v ] [ -p #pages ] niter\n", name);
+ exit(1);
+ }
+ if (strcmp(*argv, "-p") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ npages = atoi(*argv);
+ if (npages <= 0) {
+ printf("%s: Bad page count.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-v") == 0) {
+ argc--, argv++;
+ vflag++;
+ goto again;
+ }
+ niter = atoi(*argv);
+ pagesize = getpagesize();
+ pages = valloc(npages * pagesize);
+ if (pages == (char *)0) {
+ printf("Can't allocate %d pages (%2.1f megabytes).\n",
+ npages, (npages * pagesize) / (1024. * 1024.));
+ exit(3);
+ }
+ lastpage = pages + (npages * pagesize);
+ if (vflag)
+ vadvise(VA_SEQL);
+ for (i = 0; i < niter; i++)
+ for (pf = pages; pf < lastpage; pf += pagesize)
+ *pf = 1;
+}
+.vE
+.SH
+randpage
+.LP
+.vS
+/*
+ * Random page access benchmark.
+ */
+#include <sys/vadvise.h>
+
+char *valloc();
+long random();
+
+main(argc, argv)
+ char *argv[];
+{
+ register int npages = 4096, pagesize, pn, i, niter;
+ int vflag = 0, debug = 0;
+ char *pages, *name;
+
+ name = argv[0];
+ argc--, argv++;
+again:
+ if (argc < 1) {
+usage:
+ printf("usage: %s [ -d ] [ -v ] [ -p #pages ] niter\n", name);
+ exit(1);
+ }
+ if (strcmp(*argv, "-p") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ npages = atoi(*argv);
+ if (npages <= 0) {
+ printf("%s: Bad page count.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-v") == 0) {
+ argc--, argv++;
+ vflag++;
+ goto again;
+ }
+ if (strcmp(*argv, "-d") == 0) {
+ argc--, argv++;
+ debug++;
+ goto again;
+ }
+ niter = atoi(*argv);
+ pagesize = getpagesize();
+ pages = valloc(npages * pagesize);
+ if (pages == (char *)0) {
+ printf("Can't allocate %d pages (%2.1f megabytes).\n",
+ npages, (npages * pagesize) / (1024. * 1024.));
+ exit(3);
+ }
+ if (vflag)
+ vadvise(VA_ANOM);
+ for (i = 0; i < niter; i++) {
+ pn = random() % npages;
+ if (debug)
+ printf("touch page %d\n", pn);
+ pages[pagesize * pn] = 1;
+ }
+}
+.vE
+.SH
+gausspage
+.LP
+.vS
+/*
+ * Random page access with
+ * a gaussian distribution.
+ *
+ * Allocate a large (zero fill on demand) address
+ * space and fault the pages in a random gaussian
+ * order.
+ */
+
+float sqrt(), log(), rnd(), cos(), gauss();
+char *valloc();
+int rand();
+
+main(argc, argv)
+ char *argv[];
+{
+ register int pn, i, niter, delta;
+ register char *pages;
+ float sd = 10.0;
+ int npages = 4096, pagesize, debug = 0;
+ char *name;
+
+ name = argv[0];
+ argc--, argv++;
+again:
+ if (argc < 1) {
+usage:
+ printf(
+"usage: %s [ -d ] [ -p #pages ] [ -s standard-deviation ] iterations\n", name);
+ exit(1);
+ }
+ if (strcmp(*argv, "-s") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ sscanf(*argv, "%f", &sd);
+ if (sd <= 0) {
+ printf("%s: Bad standard deviation.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-p") == 0) {
+ argc--, argv++;
+ if (argc < 1)
+ goto usage;
+ npages = atoi(*argv);
+ if (npages <= 0) {
+ printf("%s: Bad page count.\n", *argv);
+ exit(2);
+ }
+ argc--, argv++;
+ goto again;
+ }
+ if (strcmp(*argv, "-d") == 0) {
+ argc--, argv++;
+ debug++;
+ goto again;
+ }
+ niter = atoi(*argv);
+ pagesize = getpagesize();
+ pages = valloc(npages*pagesize);
+ if (pages == (char *)0) {
+ printf("Can't allocate %d pages (%2.1f megabytes).\n",
+ npages, (npages*pagesize) / (1024. * 1024.));
+ exit(3);
+ }
+ pn = 0;
+ for (i = 0; i < niter; i++) {
+ delta = gauss(sd, 0.0);
+	while (pn + delta < 0 || pn + delta >= npages)
+ delta = gauss(sd, 0.0);
+ pn += delta;
+ if (debug)
+ printf("touch page %d\n", pn);
+ else
+ pages[pn * pagesize] = 1;
+ }
+}
+
+float
+gauss(sd, mean)
+ float sd, mean;
+{
+ register float qa, qb;
+
+ qa = sqrt(log(rnd()) * -2.0);
+ qb = 3.14159 * rnd();
+ return (qa * cos(qb) * sd + mean);
+}
+
+float
+rnd()
+{
+	static int biggest = 0x7fffffff;
+
+	return ((float)rand() / (float)biggest);
+}
+.vE
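+.LP
+gauss() is a single-variate Box-Muller transform,
+Z = sqrt(-2 ln U1) * cos(pi U2) * sd + mean; the textbook form uses
+2 pi, but by the symmetry of cosine the one-variate distribution is
+unchanged.  The standalone check below is an illustration, not part
+of the suite: its rnd01() stays strictly above zero so log() is
+always defined, and it verifies the sample moments.
+.vS
+/*
+ * Illustrative sanity check of the Box-Muller variate.
+ * Compile with -lm; expects mean ~0 and sd ~10.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+static double
+rnd01(void)
+{
+	/* uniform on (0, 1]: never 0, so log() below is safe */
+	return (((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
+}
+
+static double
+gauss(double sd, double mean)
+{
+	double qa = sqrt(log(rnd01()) * -2.0);
+	double qb = 3.14159265358979 * rnd01();
+
+	return (qa * cos(qb) * sd + mean);
+}
+
+int
+main(void)
+{
+	double x, sum = 0.0, sumsq = 0.0;
+	int i, n = 100000;
+
+	for (i = 0; i < n; i++) {
+		x = gauss(10.0, 0.0);
+		sum += x;
+		sumsq += x * x;
+	}
+	printf("mean %.2f (expect ~0), sd %.2f (expect ~10)\n",
+	    sum / n, sqrt(sumsq / n - (sum / n) * (sum / n)));
+	return (0);
+}
+.vE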
diff --git a/share/doc/papers/sysperf/a2.t b/share/doc/papers/sysperf/a2.t
new file mode 100644
index 000000000000..e1882cf28ba4
--- /dev/null
+++ b/share/doc/papers/sysperf/a2.t
@@ -0,0 +1,117 @@
+.\" Copyright (c) 1985 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the University of
+.\" California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)a2.t 5.1 (Berkeley) 4/17/91
+.\"
+.SH
+run (shell script)
+.LP
+.vS
+#! /bin/csh -fx
+# Script to run benchmark programs.
+#
+date
+make clean; time make
+time syscall 100000
+time seqpage -p 7500 10
+time seqpage -v -p 7500 10
+time randpage -p 7500 30000
+time randpage -v -p 7500 30000
+time gausspage -p 7500 -s 1 30000
+time gausspage -p 7500 -s 10 30000
+time gausspage -p 7500 -s 30 30000
+time gausspage -p 7500 -s 40 30000
+time gausspage -p 7500 -s 50 30000
+time gausspage -p 7500 -s 60 30000
+time gausspage -p 7500 -s 80 30000
+time gausspage -p 7500 -s 10000 30000
+time csw 10000
+time signocsw 10000
+time pipeself 10000 512
+time pipeself 10000 4
+time udgself 10000 512
+time udgself 10000 4
+time pipediscard 10000 512
+time pipediscard 10000 4
+time udgdiscard 10000 512
+time udgdiscard 10000 4
+time pipeback 10000 512
+time pipeback 10000 4
+time udgback 10000 512
+time udgback 10000 4
+size forks
+time forks 1000 0
+time forks 1000 1024
+time forks 1000 102400
+size vforks
+time vforks 1000 0
+time vforks 1000 1024
+time vforks 1000 102400
+countenv
+size nulljob
+time execs 1000 0 nulljob
+time execs 1000 1024 nulljob
+time execs 1000 102400 nulljob
+time vexecs 1000 0 nulljob
+time vexecs 1000 1024 nulljob
+time vexecs 1000 102400 nulljob
+size bigjob
+time execs 1000 0 bigjob
+time execs 1000 1024 bigjob
+time execs 1000 102400 bigjob
+time vexecs 1000 0 bigjob
+time vexecs 1000 1024 bigjob
+time vexecs 1000 102400 bigjob
+# fill environment with ~1024 bytes (15 variables, ~70 bytes each)
+setenv a 012345678901234567890123456789012345678901234567890123456780123456789
+setenv b 012345678901234567890123456789012345678901234567890123456780123456789
+setenv c 012345678901234567890123456789012345678901234567890123456780123456789
+setenv d 012345678901234567890123456789012345678901234567890123456780123456789
+setenv e 012345678901234567890123456789012345678901234567890123456780123456789
+setenv f 012345678901234567890123456789012345678901234567890123456780123456789
+setenv g 012345678901234567890123456789012345678901234567890123456780123456789
+setenv h 012345678901234567890123456789012345678901234567890123456780123456789
+setenv i 012345678901234567890123456789012345678901234567890123456780123456789
+setenv j 012345678901234567890123456789012345678901234567890123456780123456789
+setenv k 012345678901234567890123456789012345678901234567890123456780123456789
+setenv l 012345678901234567890123456789012345678901234567890123456780123456789
+setenv m 012345678901234567890123456789012345678901234567890123456780123456789
+setenv n 012345678901234567890123456789012345678901234567890123456780123456789
+setenv o 012345678901234567890123456789012345678901234567890123456780123456789
+countenv
+time execs 1000 0 nulljob
+time execs 1000 1024 nulljob
+time execs 1000 102400 nulljob
+time execs 1000 0 bigjob
+time execs 1000 1024 bigjob
+time execs 1000 102400 bigjob
+.vE
+.bp