Perl

NAVIGATION
CATEGORIES
REFERENCE
LINKS
  • Change 24687: speed up is_utf8_char()

    0 answers - 4995 bytes - related search similar search Add To My Delicious Add To My Stumble Upon Add To My Google Mark Add To My Facebook Add To My Digg Add To My Reddit

    Change 24687 by rgs@bloom on 2005/06/03 08:08:25
    Subject: [PATCH] speed up is_utf8_char()
    From: Jarkko Hietaniemi <jhietaniemi (AT) gmail (DOT) com>
    Date: Thu, 02 Jun 2005 21:52:46 +0300
    Message-ID: <429F557E.3090007 (AT) gmail (DOT) com>
    Affected files
    //depot/perl/utf8.c#233 edit
    //depot/perl/utf8.h#68 edit
    Differences
    //depot/perl/utf8.c#233 (text)
    Index: perl/utf8.c
    --- perl/utf8.c#232~24622~ Mon May 30 00:45:09 2005
    +++ perl/utf8.c Fri Jun 3 01:08:25 2005
    @@ -186,40 +186,48 @@
    STRLEN
    Perl_is_utf8_char(pTHX_ const U8 *s)
    {
    - U8 u = *s;
    - STRLEN slen, len;
    - UV uv, ouv;
    -
    - if (UTF8_IS_INVARIANT(u))
    -return 1;
    -
    - if (!UTF8_IS_START(u))
    -return 0;
    -
    + STRLEN len;
    +#ifdef IS_UTF8_CHAR
    len = UTF8SKIP(s);
    + if (len <= 4)
    + return IS_UTF8_CHAR(s, len) ? len : 0;
    +#endif /* #ifdef IS_UTF8_CHAR */
    + {
    +U8 u = *s;
    + STRLEN slen;
    + UV uv, ouv;
    +
    + if (UTF8_IS_INVARIANT(u))
    + return 1;
    +
    + if (!UTF8_IS_START(u))
    + return 0;
    +
    + len = UTF8SKIP(s);
    +
    + if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
    + return 0;
    +
    + slen = len - 1;
    + s++;
    + u &= UTF_START_MASK(len);
    + uv = u;
    + ouv = uv;
    + while (slen--) {
    + if (!UTF8_IS_CONTINUATION(*s))
    + return 0;
    + uv = UTF8_ACCUMULATE(uv, *s);
    + if (uv < ouv)
    + return 0;
    + ouv = uv;
    + s++;
    + }
    - if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
    -return 0;
    + if ((STRLEN)UNISKIP(uv) < len)
    + return 0;
    - slen = len - 1;
    - s++;
    - u &= UTF_START_MASK(len);
    - uv = u;
    - ouv = uv;
    - while (slen--) {
    -if (!UTF8_IS_CONTINUATION(*s))
    - return 0;
    -uv = UTF8_ACCUMULATE(uv, *s);
    -if (uv < ouv)
    - return 0;
    -ouv = uv;
    -s++;
    + return len;
    }
    -
    - if ((STRLEN)UNISKIP(uv) < len)
    -return 0;
    -
    - return len;
    }
    /*
    //depot/perl/utf8.h#68 (text)
    Index: perl/utf8.h
    --- perl/utf8.h#67~24271~ Thu Apr 21 08:38:30 2005
    +++ perl/utf8.h Fri Jun 3 01:08:25 2005
    @@ -257,3 +257,73 @@
    toLOWER((input)[1]) == 's')
    #endif
    #define SHARP_S_SKIP 2
    +
    +#define IS_UTF8_CHAR_1(p)\
    +((p)[0] <= 0x7F)
    +#define IS_UTF8_CHAR_2(p)\
    +((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \
    + (p)[1] >= 0x80 && (p)[1] <= 0xBF)
    +#define IS_UTF8_CHAR_3a(p)\
    +((p)[0] == 0xE0 && \
    + (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF)
    +#define IS_UTF8_CHAR_3b(p)\
    +((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \
    + (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF)
    +#define IS_UTF8_CHAR_3c(p)\
    +((p)[0] == 0xED && \
    + (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF)
    +/* In IS_UTF8_CHAR_3c(p) one could use
    + * (p)[1] >= 0x80 && (p)[1] <= 0x9F
    + * if one wanted to exclude surrogates. */
    +#define IS_UTF8_CHAR_3d(p)\
    +((p)[0] >= 0xEE && (p)[0] <= 0xEF && \
    + (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF)
    +#define IS_UTF8_CHAR_4a(p)\
    +((p)[0] == 0xF0 && \
    + (p)[1] >= 0x90 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
    + (p)[3] >= 0x80 && (p)[3] <= 0xBF)
    +#define IS_UTF8_CHAR_4b(p)\
    +((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \
    + (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
    + (p)[3] >= 0x80 && (p)[3] <= 0xBF)
    +/* In IS_UTF8_CHAR_4c(p) one could use
    + * (p)[0] == 0xF4
    + * if one wanted to stop at the Unicode limit U+10FFFF.
    + * The 0xF7 allows us to go to 0x1fffff (0x200000 would
    + * require five bytes). Not doing any further code points
    + * since that is not needed (and that would not be strict
    + * UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
    + * will take care of the "extended UTF-8". */
    +#define IS_UTF8_CHAR_4c(p)\
    +((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
    + (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
    + (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
    + (p)[3] >= 0x80 && (p)[3] <= 0xBF)
    +
    +#define IS_UTF8_CHAR_3(p)\
    +(IS_UTF8_CHAR_3a(p) || \
    + IS_UTF8_CHAR_3b(p) || \
    + IS_UTF8_CHAR_3c(p) || \
    + IS_UTF8_CHAR_3d(p))
    +#define IS_UTF8_CHAR_4(p)\
    +(IS_UTF8_CHAR_4a(p) || \
    + IS_UTF8_CHAR_4b(p) || \
    + IS_UTF8_CHAR_4c(p))
    +
    +/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
    + * (1) allows UTF-8 encoded UTF-16 surrogates
    + * (2) it allows code points past U+10FFFF.
    + * The Perl_is_utf8_char() full "slow" code will handle the Perl
    + * "extended UTF-8". */
    +#define IS_UTF8_CHAR(p, n)\
    +((n) == 1 ? IS_UTF8_CHAR_1(p) : \
    + (n) == 2 ? IS_UTF8_CHAR_2(p) : \
    + (n) == 3 ? IS_UTF8_CHAR_3(p) : \
    + (n) == 4 ? IS_UTF8_CHAR_4(p) : 0)
    +
    End of Patch.

Re: Change 24687: speed up is_utf8_char()


max 4000 letters.
Your nickname to display:
In order to stop the spam: 1 + 0 =
QUESTION ON "Perl"

EMSDN.COM