From 76193d7cb0457807b2f0b95f909ab5de19480cd7 Mon Sep 17 00:00:00 2001
From: rsc <devnull@localhost>
Date: Tue, 30 Sep 2003 17:47:42 +0000
Subject: Initial revision

---
 man/man7/regexp9.7 | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 man/man7/utf.7     |  91 ++++++++++++++++++++++++++++++++
 2 files changed, 241 insertions(+)
 create mode 100644 man/man7/regexp9.7
 create mode 100644 man/man7/utf.7

(limited to 'man/man7')

diff --git a/man/man7/regexp9.7 b/man/man7/regexp9.7
new file mode 100644
index 00000000..14a90d0f
--- /dev/null
+++ b/man/man7/regexp9.7
@@ -0,0 +1,150 @@
+.TH REGEXP9 7
+.de EX
+.nf
+.ft B
+..
+.de EE
+.fi
+.ft R
+..
+.de LR
+.if t .BR \\$1 \\$2
+.if n .RB ` \\$1 '\\$2
+..
+.de L
+.nh
+.if t .B \\$1
+.if n .RB ` \\$1 '
+..
+.SH NAME
+regexp9 \- Plan 9 regular expression notation
+.SH DESCRIPTION
+This manual page describes the regular expression
+syntax used by the Plan 9 regular expression library
+.IR regexp9 (3).
+It is the form used by
+.IR egrep (1)
+before
+.I egrep
+got complicated.
+.PP
+A 
+.I "regular expression"
+specifies
+a set of strings of characters.
+A member of this set of strings is said to be
+.I matched
+by the regular expression.  In many applications
+a delimiter character, commonly
+.LR / ,
+bounds a regular expression.
+In the following specification for regular expressions
+the word `character' means any character (rune) but newline.
+.PP
+The syntax for a regular expression
+.B e0
+is
+.IP
+.EX
+e3:  literal | charclass | '.' | '^' | '$' | '(' e0 ')'
+
+e2:  e3
+  |  e2 REP
+
+REP: '*' | '+' | '?'
+
+e1:  e2
+  |  e1 e2
+
+e0:  e1
+  |  e0 '|' e1
+.EE
+.PP
+A
+.B literal
+is any non-metacharacter, or a metacharacter
+(one of
+.BR .*+?[]()|\e^$ ),
+or the delimiter
+preceded by 
+.LR \e .
+.PP
+A
+.B charclass
+is a nonempty string
+.I s
+bracketed
+.BI [ \|s\| ]
+(or
+.BI [^ s\| ]\fR);
+it matches any character in (or not in)
+.IR s .
+A negated character class never
+matches newline.
+A substring 
+.IB a - b\f1,
+with
+.I a
+and
+.I b
+in ascending
+order, stands for the inclusive
+range of
+characters between
+.I a
+and
+.IR b .
+In 
+.IR s ,
+the metacharacters
+.LR - ,
+.LR ] ,
+an initial
+.LR ^ ,
+and the regular expression delimiter
+must be preceded by a
+.LR \e ;
+other metacharacters 
+have no special meaning and
+may appear unescaped.
+.PP
+A 
+.L .
+matches any character.
+.PP
+A
+.L ^
+matches the beginning of a line;
+.L $
+matches the end of the line.
+.PP
+The 
+.B REP
+operators match zero or more
+.RB ( * ),
+one or more
+.RB ( + ),
+zero or one
+.RB ( ? ),
+instances respectively of the preceding regular expression 
+.BR e2 .
+.PP
+A concatenated regular expression,
+.BR "e1\|e2" ,
+matches a match to 
+.B e1
+followed by a match to
+.BR e2 .
+.PP
+An alternative regular expression,
+.BR "e0\||\|e1" ,
+matches either a match to
+.B e0
+or a match to
+.BR e1 .
+.PP
+A match to any part of a regular expression
+extends as far as possible without preventing
+a match to the remainder of the regular expression.
+.SH "SEE ALSO"
+.IR regexp9 (3)
diff --git a/man/man7/utf.7 b/man/man7/utf.7
new file mode 100644
index 00000000..97b7b1e7
--- /dev/null
+++ b/man/man7/utf.7
@@ -0,0 +1,91 @@
+.TH UTF 7
+.SH NAME
+UTF, Unicode, ASCII, rune \- character set and format
+.SH DESCRIPTION
+The Plan 9 character set and representation are
+based on the Unicode Standard and on the ISO multibyte
+.SM UTF-8
+encoding (Universal Character
+Set Transformation Format, 8 bits wide).
+The Unicode Standard represents its characters in 16
+bits;
+.SM UTF-8
+represents such
+values in an 8-bit byte stream.
+Throughout this manual,
+.SM UTF-8
+is shortened to
+.SM UTF.
+.PP
+In Plan 9, a
+.I rune
+is a 16-bit quantity representing a Unicode character.
+Internally, programs may store characters as runes.
+However, any external manifestation of textual information,
+in files or at the interface between programs, uses a
+machine-independent, byte-stream encoding called
+.SM UTF.
+.PP
+.SM UTF
+is designed so the 7-bit
+.SM ASCII
+set (values hexadecimal 00 to 7F),
+appear only as themselves
+in the encoding.
+Runes with values above 7F appear as sequences of two or more
+bytes with values only from 80 to FF.
+.PP
+The
+.SM UTF
+encoding of the Unicode Standard is backward compatible with
+.SM ASCII\c
+:
+programs presented only with
+.SM ASCII
+work on Plan 9
+even if not written to deal with
+.SM UTF,
+as do
+programs that deal with uninterpreted byte streams.
+However, programs that perform semantic processing on
+.SM ASCII
+graphic
+characters must convert from
+.SM UTF
+to runes
+in order to work properly with non-\c
+.SM ASCII
+input.
+See
+.IR rune (2).
+.PP
+Letting numbers be binary,
+a rune x is converted to a multibyte
+.SM UTF
+sequence
+as follows:
+.PP
+01.   x in [00000000.0bbbbbbb] → 0bbbbbbb
+.br
+10.   x in [00000bbb.bbbbbbbb] → 110bbbbb, 10bbbbbb
+.br
+11.   x in [bbbbbbbb.bbbbbbbb] → 1110bbbb, 10bbbbbb, 10bbbbbb
+.br
+.PP
+Conversion 01 provides a one-byte sequence that spans the
+.SM ASCII
+character set in a compatible way.
+Conversions 10 and 11 represent higher-valued characters
+as sequences of two or three bytes with the high bit set.
+Plan 9 does not support the 4, 5, and 6 byte sequences proposed by X-Open.
+When there are multiple ways to encode a value, for example rune 0,
+the shortest encoding is used.
+.PP
+In the inverse mapping,
+any sequence except those described above
+is incorrect and is converted to rune hexadecimal 0080.
+.SH "SEE ALSO"
+.IR ascii (1),
+.IR tcs (1),
+.IR rune (3),
+.IR "The Unicode Standard" .
-- 
cgit v1.2.3