diff options
Diffstat (limited to 'contrib/file/ascmagic.c')
-rw-r--r-- | contrib/file/ascmagic.c | 248 |
1 files changed, 139 insertions, 109 deletions
diff --git a/contrib/file/ascmagic.c b/contrib/file/ascmagic.c index 8217f19422f6..3df6aaae7e35 100644 --- a/contrib/file/ascmagic.c +++ b/contrib/file/ascmagic.c @@ -1,10 +1,39 @@ /* + * Copyright (c) Ian F. Darwin 1986-1995. + * Software written by Ian F. Darwin and others; + * maintained 1995-present by Christos Zoulas and others. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Ian F. Darwin and others. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* * ASCII magic -- file types that we know based on keywords * that can appear anywhere in the file. * - * Copyright (c) Ian F. Darwin, 1987. - * Written by Ian F. Darwin. - * * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, * to handle character codes other than ASCII on a unified basis. * @@ -12,29 +41,9 @@ * international characters, now subsumed into this file. */ -/* - * This software is not subject to any license of the American Telephone - * and Telegraph Company or of the Regents of the University of California. - * - * Permission is granted to anyone to use this software for any purpose on - * any computer system, and to alter it and redistribute it freely, subject - * to the following restrictions: - * - * 1. The author is not responsible for the consequences of use of this - * software, no matter how awful, even if they arise from flaws in it. - * - * 2. The origin of this software must not be misrepresented, either by - * explicit claim or by omission. Since few users ever read sources, - * credits must appear in the documentation. - * - * 3. Altered versions must be plainly marked as such, and must not be - * misrepresented as being the original software. Since few users - * ever read sources, credits must appear in the documentation. - * - * 4. This notice may not be removed or altered. - */ - #include "file.h" +#include "magic.h" +#include <stdio.h> #include <string.h> #include <memory.h> #include <ctype.h> @@ -45,7 +54,7 @@ #include "names.h" #ifndef lint -FILE_RCSID("@(#)$Id: ascmagic.c,v 1.33 2003/02/08 18:33:53 christos Exp $") +FILE_RCSID("@(#)$Id: ascmagic.c,v 1.40 2003/11/20 00:25:39 christos Exp $") #endif /* lint */ typedef unsigned long unichar; @@ -54,29 +63,29 @@ typedef unsigned long unichar; #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f') -static int looks_ascii(const unsigned char *, int, unichar *, int *); -static int looks_utf8(const unsigned char *, int, unichar *, int *); -static int looks_unicode(const unsigned char *, int, unichar *, int *); -static int looks_latin1(const unsigned char *, int, unichar *, int *); -static int looks_extended(const unsigned char *, int, unichar *, int *); -static void from_ebcdic(const unsigned char *, int, unsigned char *); -static int ascmatch(const unsigned char *, const unichar *, int); - -/* int nbytes: size actually read */ -int -ascmagic(unsigned char *buf, int nbytes) +private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); +private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *); +private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *); +private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); +private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); +private void from_ebcdic(const unsigned char *, size_t, unsigned char *); +private int ascmatch(const unsigned char *, const unichar *, size_t); + + +protected int +file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) { - int i; + size_t i; unsigned char nbuf[HOWMANY+1]; /* one extra for terminating '\0' */ unichar ubuf[HOWMANY+1]; /* one extra for terminating '\0' */ - int ulen; + size_t ulen; struct names *p; - char *code = NULL; - char *code_mime = NULL; - char *type = NULL; - char *subtype = NULL; - char *subtype_mime = NULL; + const char *code = NULL; + const char *code_mime = NULL; + const char *type = NULL; + const char *subtype = NULL; + const char *subtype_mime = NULL; int has_escapes = 0; int has_backspace = 0; @@ -90,20 +99,6 @@ ascmagic(unsigned char *buf, int nbytes) int has_long_lines = 0; /* - * Do the tar test first, because if the first file in the tar - * archive starts with a dot, we can confuse it with an nroff file. - */ - switch (is_tar(buf, nbytes)) { - case 1: - ckfputs(iflag ? "application/x-tar" : "tar archive", stdout); - return 1; - case 2: - ckfputs(iflag ? "application/x-tar, POSIX" - : "POSIX tar archive", stdout); - return 1; - } - - /* * Undo the NUL-termination kindly provided by process() * but leave at least one byte to look at */ @@ -111,6 +106,10 @@ ascmagic(unsigned char *buf, int nbytes) while (nbytes > 1 && buf[nbytes - 1] == '\0') nbytes--; + /* nbuf and ubuf relies on this */ + if (nbytes > HOWMANY) + nbytes = HOWMANY; + /* * Then try to determine whether it's any character code we can * identify. Each of these tests, if it succeeds, will leave @@ -125,7 +124,7 @@ ascmagic(unsigned char *buf, int nbytes) code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; - } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen))) { + } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else @@ -171,8 +170,10 @@ ascmagic(unsigned char *buf, int nbytes) while (ISSPC(*tp)) ++tp; /* skip leading whitespace */ if ((tp[0] == '\\' && tp[1] == '\"') || - (isascii(tp[0]) && isalnum(tp[0]) && - isascii(tp[1]) && isalnum(tp[1]) && + (isascii((unsigned char)tp[0]) && + isalnum((unsigned char)tp[0]) && + isascii((unsigned char)tp[1]) && + isalnum((unsigned char)tp[1]) && ISSPC(tp[2]))) { subtype_mime = "text/troff"; subtype = "troff or preprocessor input"; @@ -190,7 +191,7 @@ ascmagic(unsigned char *buf, int nbytes) i = 0; while (i < ulen) { - int end; + size_t end; /* * skip past any leading space @@ -211,7 +212,7 @@ ascmagic(unsigned char *buf, int nbytes) * compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { - if (ascmatch((unsigned char *)p->name, ubuf + i, + if (ascmatch((const unsigned char *)p->name, ubuf + i, end - i)) { subtype = types[p->type].human; subtype_mime = types[p->type].mime; @@ -244,7 +245,7 @@ subtype_identified: n_cr++; last_line_end = i; } - if (ubuf[i] == '\n' && (i - 1 < 0 || ubuf[i - 1] != '\r')) { + if (ubuf[i] == '\n' && ((int)i - 1 < 0 || ubuf[i - 1] != '\r')){ n_lf++; last_line_end = i; } @@ -254,29 +255,40 @@ subtype_identified: } } - if (iflag) { - if (subtype_mime) - ckfputs(subtype_mime, stdout); - else - ckfputs("text/plain", stdout); + if ((ms->flags & MAGIC_MIME)) { + if (subtype_mime) { + if (file_printf(ms, subtype_mime) == -1) + return -1; + } else { + if (file_printf(ms, "text/plain") == -1) + return -1; + } if (code_mime) { - ckfputs("; charset=", stdout); - ckfputs(code_mime, stdout); + if (file_printf(ms, "; charset=") == -1) + return -1; + if (file_printf(ms, code_mime) == -1) + return -1; } } else { - ckfputs(code, stdout); + if (file_printf(ms, code) == -1) + return -1; if (subtype) { - ckfputs(" ", stdout); - ckfputs(subtype, stdout); + if (file_printf(ms, " ") == -1) + return -1; + if (file_printf(ms, subtype) == -1) + return -1; } - ckfputs(" ", stdout); - ckfputs(type, stdout); + if (file_printf(ms, " ") == -1) + return -1; + if (file_printf(ms, type) == -1) + return -1; if (has_long_lines) - ckfputs(", with very long lines", stdout); + if (file_printf(ms, ", with very long lines") == -1) + return -1; /* * Only report line terminators if we find one other than LF, @@ -284,44 +296,56 @@ subtype_identified: */ if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { - ckfputs(", with", stdout); + if (file_printf(ms, ", with") == -1) + return -1; - if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) - ckfputs(" no", stdout); - else { + if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { + if (file_printf(ms, " no") == -1) + return -1; + } else { if (n_crlf) { - ckfputs(" CRLF", stdout); + if (file_printf(ms, " CRLF") == -1) + return -1; if (n_cr || n_lf || n_nel) - ckfputs(",", stdout); + if (file_printf(ms, ",") == -1) + return -1; } if (n_cr) { - ckfputs(" CR", stdout); + if (file_printf(ms, " CR") == -1) + return -1; if (n_lf || n_nel) - ckfputs(",", stdout); + if (file_printf(ms, ",") == -1) + return -1; } if (n_lf) { - ckfputs(" LF", stdout); + if (file_printf(ms, " LF") == -1) + return -1; if (n_nel) - ckfputs(",", stdout); + if (file_printf(ms, ",") == -1) + return -1; } if (n_nel) - ckfputs(" NEL", stdout); + if (file_printf(ms, " NEL") == -1) + return -1; } - ckfputs(" line terminators", stdout); + if (file_printf(ms, " line terminators") == -1) + return -1; } if (has_escapes) - ckfputs(", with escape sequences", stdout); + if (file_printf(ms, ", with escape sequences") == -1) + return -1; if (has_backspace) - ckfputs(", with overstriking", stdout); + if (file_printf(ms, ", with overstriking") == -1) + return -1; } return 1; } -static int -ascmatch(const unsigned char *s, const unichar *us, int ulen) +private int +ascmatch(const unsigned char *s, const unichar *us, size_t ulen) { size_t i; @@ -393,7 +417,7 @@ ascmatch(const unsigned char *s, const unichar *us, int ulen) #define I 2 /* character appears in ISO-8859 text */ #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ -static char text_chars[256] = { +private char text_chars[256] = { /* BEL BS HT LF FF CR */ F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ /* ESC */ @@ -415,8 +439,9 @@ static char text_chars[256] = { I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ }; -static int -looks_ascii(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) +private int +looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, + size_t *ulen) { int i; @@ -434,8 +459,8 @@ looks_ascii(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) return 1; } -static int -looks_latin1(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) +private int +looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { int i; @@ -453,8 +478,9 @@ looks_latin1(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) return 1; } -static int -looks_extended(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) +private int +looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, + size_t *ulen) { int i; @@ -472,8 +498,8 @@ looks_extended(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) return 1; } -int -looks_utf8(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) +private int +looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { int i, n; unichar c; @@ -534,8 +560,9 @@ done: return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ } -static int -looks_unicode(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) +private int +looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, + size_t *ulen) { int bigend; int i; @@ -562,7 +589,8 @@ looks_unicode(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) if (ubuf[*ulen - 1] == 0xfffe) return 0; - if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T) + if (ubuf[*ulen - 1] < 128 && + text_chars[(size_t)ubuf[*ulen - 1]] != T) return 0; } @@ -596,7 +624,7 @@ looks_unicode(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen) * between old-style and internationalized examples of text. */ -unsigned char ebcdic_to_ascii[] = { +private unsigned char ebcdic_to_ascii[] = { 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, @@ -615,6 +643,7 @@ unsigned char ebcdic_to_ascii[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 }; +#ifdef notdef /* * The following EBCDIC-to-ASCII table may relate more closely to reality, * or at least to modern reality. It comes from @@ -629,7 +658,7 @@ unsigned char ebcdic_to_ascii[] = { * cases for the NEL character can be taken out of the code. */ -unsigned char ebcdic_1047_to_8859[] = { +private unsigned char ebcdic_1047_to_8859[] = { 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, @@ -647,12 +676,13 @@ unsigned char ebcdic_1047_to_8859[] = { 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F }; +#endif /* * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. */ -static void -from_ebcdic(const unsigned char *buf, int nbytes, unsigned char *out) +private void +from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) { int i; |