Change 24687 by rgs@bloom on 2005/06/03 08:08:25
Subject: [PATCH] speed up is_utf8_char()
From: Jarkko Hietaniemi <jhietaniemi (AT) gmail (DOT) com>
Date: Thu, 02 Jun 2005 21:52:46 +0300
Message-ID: <429F557E.3090007 (AT) gmail (DOT) com>
Affected files
//depot/perl/utf8.c#233 edit
//depot/perl/utf8.h#68 edit
Differences
//depot/perl/utf8.c#233 (text)
Index: perl/utf8.c
--- perl/utf8.c#232~24622~	Mon May 30 00:45:09 2005
+++ perl/utf8.c	Fri Jun  3 01:08:25 2005
@@ -186,40 +186,48 @@
STRLEN
Perl_is_utf8_char(pTHX_ const U8 *s)
{
- U8 u = *s;
- STRLEN slen, len;
- UV uv, ouv;
-
- if (UTF8_IS_INVARIANT(u))
-return 1;
-
- if (!UTF8_IS_START(u))
-return 0;
-
+ STRLEN len;
+#ifdef IS_UTF8_CHAR
len = UTF8SKIP(s);
+ if (len <= 4)
+ return IS_UTF8_CHAR(s, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+ {
+U8 u = *s;
+ STRLEN slen;
+ UV uv, ouv;
+
+ if (UTF8_IS_INVARIANT(u))
+ return 1;
+
+ if (!UTF8_IS_START(u))
+ return 0;
+
+ len = UTF8SKIP(s);
+
+ if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
+ return 0;
+
+ slen = len - 1;
+ s++;
+ u &= UTF_START_MASK(len);
+ uv = u;
+ ouv = uv;
+ while (slen--) {
+ if (!UTF8_IS_CONTINUATION(*s))
+ return 0;
+ uv = UTF8_ACCUMULATE(uv, *s);
+ if (uv < ouv)
+ return 0;
+ ouv = uv;
+ s++;
+ }
- if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
-return 0;
+ if ((STRLEN)UNISKIP(uv) < len)
+ return 0;
- slen = len - 1;
- s++;
- u &= UTF_START_MASK(len);
- uv = u;
- ouv = uv;
- while (slen--) {
-if (!UTF8_IS_CONTINUATION(*s))
- return 0;
-uv = UTF8_ACCUMULATE(uv, *s);
-if (uv < ouv)
- return 0;
-ouv = uv;
-s++;
+ return len;
}
-
- if ((STRLEN)UNISKIP(uv) < len)
-return 0;
-
- return len;
}
/*
//depot/perl/utf8.h#68 (text)
Index: perl/utf8.h
--- perl/utf8.h#67~24271~	Thu Apr 21 08:38:30 2005
+++ perl/utf8.h	Fri Jun  3 01:08:25 2005
@@ -257,3 +257,73 @@
toLOWER((input)[1]) == 's')
#endif
#define SHARP_S_SKIP 2
+
+#define IS_UTF8_CHAR_1(p)\
+((p)[0] <= 0x7F)
+#define IS_UTF8_CHAR_2(p)\
+((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \
+ (p)[1] >= 0x80 && (p)[1] <= 0xBF)
+#define IS_UTF8_CHAR_3a(p)\
+((p)[0] == 0xE0 && \
+ (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF)
+#define IS_UTF8_CHAR_3b(p)\
+((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \
+ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF)
+#define IS_UTF8_CHAR_3c(p)\
+((p)[0] == 0xED && \
+ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF)
+/* In IS_UTF8_CHAR_3c(p) one could use
+ * (p)[1] >= 0x80 && (p)[1] <= 0x9F
+ * if one wanted to exclude surrogates. */
+#define IS_UTF8_CHAR_3d(p)\
+((p)[0] >= 0xEE && (p)[0] <= 0xEF && \
+ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF)
+#define IS_UTF8_CHAR_4a(p)\
+((p)[0] == 0xF0 && \
+ (p)[1] >= 0x90 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
+ (p)[3] >= 0x80 && (p)[3] <= 0xBF)
+#define IS_UTF8_CHAR_4b(p)\
+((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \
+ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
+ (p)[3] >= 0x80 && (p)[3] <= 0xBF)
+/* In IS_UTF8_CHAR_4c(p) one could use
+ * (p)[0] == 0xF4 (together with (p)[1] <= 0x8F)
+ * if one wanted to stop at the Unicode limit U+10FFFF.
+ * The 0xF7 allows us to go to 0x1fffff (0x200000 would
+ * require five bytes). Not doing any further code points
+ * since that is not needed (and that would not be strict
+ * UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
+ * will take care of the "extended UTF-8". */
+#define IS_UTF8_CHAR_4c(p)\
+((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
+ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
+ (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
+ (p)[3] >= 0x80 && (p)[3] <= 0xBF)
+
+#define IS_UTF8_CHAR_3(p)\
+(IS_UTF8_CHAR_3a(p) || \
+ IS_UTF8_CHAR_3b(p) || \
+ IS_UTF8_CHAR_3c(p) || \
+ IS_UTF8_CHAR_3d(p))
+#define IS_UTF8_CHAR_4(p)\
+(IS_UTF8_CHAR_4a(p) || \
+ IS_UTF8_CHAR_4b(p) || \
+ IS_UTF8_CHAR_4c(p))
+
+/* IS_UTF8_CHAR(p, n) is strictly speaking wrong (not UTF-8) because it
+ * (1) allows UTF-8 encoded UTF-16 surrogates, and
+ * (2) allows code points past U+10FFFF.
+ * The Perl_is_utf8_char() full "slow" code will handle the Perl
+ * "extended UTF-8". */
+#define IS_UTF8_CHAR(p, n)\
+((n) == 1 ? IS_UTF8_CHAR_1(p) : \
+ (n) == 2 ? IS_UTF8_CHAR_2(p) : \
+ (n) == 3 ? IS_UTF8_CHAR_3(p) : \
+ (n) == 4 ? IS_UTF8_CHAR_4(p) : 0)
+
End of Patch.