From 354bb40e75d94466e91fe6960523612c9d17ccfb Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 2 Nov 2017 23:11:29 +0300 Subject: Add implementation --- mysql/strings/ctype-mb.c | 1502 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1502 insertions(+) create mode 100644 mysql/strings/ctype-mb.c (limited to 'mysql/strings/ctype-mb.c') diff --git a/mysql/strings/ctype-mb.c b/mysql/strings/ctype-mb.c new file mode 100644 index 0000000..784aab1 --- /dev/null +++ b/mysql/strings/ctype-mb.c @@ -0,0 +1,1502 @@ +/* Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include +#include "m_ctype.h" +#include "m_string.h" + + +size_t my_caseup_str_mb(const CHARSET_INFO *cs, char *str) +{ + uint32 l; + const uchar *map= cs->to_upper; + char *str_orig= str; + + while (*str) + { + /* Pointing after the '\0' is safe here. */ + if ((l= my_ismbchar(cs, str, str + cs->mbmaxlen))) + str+= l; + else + { + *str= (char) map[(uchar)*str]; + str++; + } + } + return (size_t) (str - str_orig); +} + + +size_t my_casedn_str_mb(const CHARSET_INFO *cs, char *str) +{ + uint32 l; + const uchar *map= cs->to_lower; + char *str_orig= str; + + while (*str) + { + /* Pointing after the '\0' is safe here. */ + if ((l= my_ismbchar(cs, str, str + cs->mbmaxlen))) + str+= l; + else + { + *str= (char) map[(uchar)*str]; + str++; + } + } + return (size_t) (str - str_orig); +} + + +static inline const MY_UNICASE_CHARACTER* +get_case_info_for_ch(const CHARSET_INFO *cs, uint page, uint offs) +{ + const MY_UNICASE_CHARACTER *p; + return cs->caseinfo ? ((p= cs->caseinfo->page[page]) ? &p[offs] : NULL) : NULL; +} + + +/* + For character sets which don't change octet length in case conversion. +*/ +size_t my_caseup_mb(const CHARSET_INFO *cs, char *src, size_t srclen, + char *dst MY_ATTRIBUTE((unused)), + size_t dstlen MY_ATTRIBUTE((unused))) +{ + uint32 l; + char *srcend= src + srclen; + const uchar *map= cs->to_upper; + + DBUG_ASSERT(cs->caseup_multiply == 1); + DBUG_ASSERT(src == dst && srclen == dstlen); + DBUG_ASSERT(cs->mbmaxlen == 2); + + while (src < srcend) + { + if ((l=my_ismbchar(cs, src, srcend))) + { + const MY_UNICASE_CHARACTER *ch; + if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1]))) + { + *src++= ch->toupper >> 8; + *src++= ch->toupper & 0xFF; + } + else + src+= l; + } + else + { + *src=(char) map[(uchar) *src]; + src++; + } + } + return srclen; +} + + +size_t my_casedn_mb(const CHARSET_INFO *cs, char *src, size_t srclen, + char *dst MY_ATTRIBUTE((unused)), + size_t dstlen MY_ATTRIBUTE((unused))) +{ + uint32 l; + char *srcend= src + srclen; + const uchar *map=cs->to_lower; + + DBUG_ASSERT(cs->casedn_multiply == 1); + DBUG_ASSERT(src == dst && srclen == dstlen); + DBUG_ASSERT(cs->mbmaxlen == 2); + + while (src < srcend) + { + if ((l= my_ismbchar(cs, src, srcend))) + { + const MY_UNICASE_CHARACTER *ch; + if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1]))) + { + *src++= ch->tolower >> 8; + *src++= ch->tolower & 0xFF; + } + else + src+= l; + } + else + { + *src= (char) map[(uchar)*src]; + src++; + } + } + return srclen; +} + + +/* + Case folding functions for character set + where case conversion can change string octet length. + For example, in EUCKR, + _euckr 0xA9A5 == "LATIN LETTER DOTLESS I" (Turkish letter) + is upper-cased to to + _euckr 0x49 "LATIN CAPITAL LETTER I" ('usual' letter I) + Length is reduced in this example from two bytes to one byte. +*/ +static size_t +my_casefold_mb_varlen(const CHARSET_INFO *cs, + char *src, size_t srclen, + char *dst, size_t dstlen MY_ATTRIBUTE((unused)), + const uchar *map, + size_t is_upper) +{ + char *srcend= src + srclen, *dst0= dst; + + DBUG_ASSERT(cs->mbmaxlen == 2); + + while (src < srcend) + { + size_t mblen= my_ismbchar(cs, src, srcend); + if (mblen) + { + const MY_UNICASE_CHARACTER *ch; + if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1]))) + { + int code= is_upper ? ch->toupper : ch->tolower; + src+= 2; + if (code > 0xFF) + *dst++= code >> 8; + *dst++= code & 0xFF; + } + else + { + *dst++= *src++; + *dst++= *src++; + } + } + else + { + *dst++= (char) map[(uchar) *src++]; + } + } + return (size_t) (dst - dst0); +} + + +size_t +my_casedn_mb_varlen(const CHARSET_INFO *cs, char *src, size_t srclen, + char *dst, size_t dstlen) +{ + DBUG_ASSERT(dstlen >= srclen * cs->casedn_multiply); + DBUG_ASSERT(src != dst || cs->casedn_multiply == 1); + return my_casefold_mb_varlen(cs, src, srclen, dst, dstlen, cs->to_lower, 0); +} + + +size_t +my_caseup_mb_varlen(const CHARSET_INFO *cs, char *src, size_t srclen, + char *dst, size_t dstlen) +{ + DBUG_ASSERT(dstlen >= srclen * cs->caseup_multiply); + DBUG_ASSERT(src != dst || cs->caseup_multiply == 1); + return my_casefold_mb_varlen(cs, src, srclen, dst, dstlen, cs->to_upper, 1); +} + + +/* + my_strcasecmp_mb() returns 0 if strings are equal, non-zero otherwise. + */ + +int my_strcasecmp_mb(const CHARSET_INFO *cs,const char *s, const char *t) +{ + uint32 l; + const uchar *map=cs->to_upper; + + while (*s && *t) + { + /* Pointing after the '\0' is safe here. */ + if ((l=my_ismbchar(cs, s, s + cs->mbmaxlen))) + { + while (l--) + if (*s++ != *t++) + return 1; + } + else if (my_mbcharlen(cs, *t) != 1 || + map[(uchar) *s++] != map[(uchar) *t++]) + return 1; + } + /* At least one of '*s' and '*t' is zero here. */ + DBUG_ASSERT(!*t || !*s); + return (*t != *s); +} + + +/* +** Compare string against string with wildcard +** 0 if matched +** -1 if not matched with wildcard +** 1 if matched with wildcard +*/ + +#define INC_PTR(cs,A,B) A+=(my_ismbchar(cs,A,B) ? my_ismbchar(cs,A,B) : 1) + +#define likeconv(s,A) (uchar) (s)->sort_order[(uchar) (A)] + +static +int my_wildcmp_mb_impl(const CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many, int recurse_level) +{ + int result= -1; /* Not found, using wildcards */ + + if (my_string_stack_guard && my_string_stack_guard(recurse_level)) + return 1; + while (wildstr != wildend) + { + while (*wildstr != w_many && *wildstr != w_one) + { + int l; + if (*wildstr == escape && wildstr+1 != wildend) + wildstr++; + if ((l = my_ismbchar(cs, wildstr, wildend))) + { + if (str+l > str_end || memcmp(str, wildstr, l) != 0) + return 1; + str += l; + wildstr += l; + } + else + if (str == str_end || likeconv(cs,*wildstr++) != likeconv(cs,*str++)) + return(1); /* No match */ + if (wildstr == wildend) + return (str != str_end); /* Match if both are at end */ + result=1; /* Found an anchor char */ + } + if (*wildstr == w_one) + { + do + { + if (str == str_end) /* Skip one char if possible */ + return (result); + INC_PTR(cs,str,str_end); + } while (++wildstr < wildend && *wildstr == w_one); + if (wildstr == wildend) + break; + } + if (*wildstr == w_many) + { /* Found w_many */ + uchar cmp; + const char* mb = wildstr; + int mb_len=0; + + wildstr++; + /* Remove any '%' and '_' from the wild search string */ + for (; wildstr != wildend ; wildstr++) + { + if (*wildstr == w_many) + continue; + if (*wildstr == w_one) + { + if (str == str_end) + return (-1); + INC_PTR(cs,str,str_end); + continue; + } + break; /* Not a wild character */ + } + if (wildstr == wildend) + return(0); /* Ok if w_many is last */ + if (str == str_end) + return -1; + + if ((cmp= *wildstr) == escape && wildstr+1 != wildend) + cmp= *++wildstr; + + mb=wildstr; + mb_len= my_ismbchar(cs, wildstr, wildend); + INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */ + cmp=likeconv(cs,cmp); + do + { + for (;;) + { + if (str >= str_end) + return -1; + if (mb_len) + { + if (str+mb_len <= str_end && memcmp(str, mb, mb_len) == 0) + { + str += mb_len; + break; + } + } + else if (!my_ismbchar(cs, str, str_end) && + likeconv(cs,*str) == cmp) + { + str++; + break; + } + INC_PTR(cs,str, str_end); + } + { + int tmp=my_wildcmp_mb_impl(cs,str,str_end, + wildstr,wildend,escape,w_one, + w_many, recurse_level + 1); + if (tmp <= 0) + return (tmp); + } + } while (str != str_end && wildstr[0] != w_many); + return(-1); + } + } + return (str != str_end ? 1 : 0); +} + +int my_wildcmp_mb(const CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many) +{ + return my_wildcmp_mb_impl(cs, str, str_end, + wildstr, wildend, + escape, w_one, w_many, 1); +} + + +size_t my_numchars_mb(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), + const char *pos, const char *end) +{ + size_t count= 0; + while (pos < end) + { + uint mb_len; + pos+= (mb_len= my_ismbchar(cs,pos,end)) ? mb_len : 1; + count++; + } + return count; +} + + +size_t my_charpos_mb(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), + const char *pos, const char *end, size_t length) +{ + const char *start= pos; + + while (length && pos < end) + { + uint mb_len; + pos+= (mb_len= my_ismbchar(cs, pos, end)) ? mb_len : 1; + length--; + } + return (size_t) (length ? end+2-start : pos-start); +} + + +size_t my_well_formed_len_mb(const CHARSET_INFO *cs, const char *b, + const char *e, size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + my_wc_t wc; + int mb_len; + + if ((mb_len= cs->cset->mb_wc(cs, &wc, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + + +uint my_instr_mb(const CHARSET_INFO *cs, + const char *b, size_t b_length, + const char *s, size_t s_length, + my_match_t *match, uint nmatch) +{ + const char *end, *b0; + int res= 0; + + if (s_length <= b_length) + { + if (!s_length) + { + if (nmatch) + { + match->beg= 0; + match->end= 0; + match->mb_len= 0; + } + return 1; /* Empty string is always found */ + } + + b0= b; + end= b+b_length-s_length+1; + + while (b < end) + { + int mb_len; + + if (!cs->coll->strnncoll(cs, (uchar*) b, s_length, + (uchar*) s, s_length, 0)) + { + if (nmatch) + { + match[0].beg= 0; + match[0].end= (uint) (b-b0); + match[0].mb_len= res; + if (nmatch > 1) + { + match[1].beg= match[0].end; + match[1].end= match[0].end + (uint)s_length; + match[1].mb_len= 0; /* Not computed */ + } + } + return 2; + } + mb_len= (mb_len= my_ismbchar(cs, b, end)) ? mb_len : 1; + b+= mb_len; + b_length-= mb_len; + res++; + } + } + return 0; +} + + +/* BINARY collations handlers for MB charsets */ + +int +my_strnncoll_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + size_t len= MY_MIN(slen,tlen); + int cmp= memcmp(s,t,len); + return cmp ? cmp : (int) ((t_is_prefix ? len : slen) - tlen); +} + + +/* + Compare two strings. + + SYNOPSIS + my_strnncollsp_mb_bin() + cs Chararacter set + s String to compare + slen Length of 's' + t String to compare + tlen Length of 't' + diff_if_only_endspace_difference + Set to 1 if the strings should be regarded as different + if they only difference in end space + + NOTE + This function is used for character strings with binary collations. + The shorter string is extended with end space to be as long as the longer + one. + + RETURN + A negative number if s < t + A positive number if s > t + 0 if strings are equal +*/ + +int +my_strnncollsp_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), + const uchar *a, size_t a_length, + const uchar *b, size_t b_length, + my_bool diff_if_only_endspace_difference) +{ + const uchar *end; + size_t length; + int res; + +#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE + diff_if_only_endspace_difference= 0; +#endif + + end= a + (length= MY_MIN(a_length, b_length)); + while (a < end) + { + if (*a++ != *b++) + return ((int) a[-1] - (int) b[-1]); + } + res= 0; + if (a_length != b_length) + { + int swap= 1; + if (diff_if_only_endspace_difference) + res= 1; /* Assume 'a' is bigger */ + /* + Check the next not space character of the longer key. If it's < ' ', + then it's smaller than the other key. + */ + if (a_length < b_length) + { + /* put shorter key in s */ + a_length= b_length; + a= b; + swap= -1; /* swap sign of result */ + res= -res; + } + for (end= a + a_length-length; a < end ; a++) + { + if (*a != ' ') + return (*a < ' ') ? -swap : swap; + } + } + return res; +} + + +/* + Copy one non-ascii character. + "dst" must have enough room for the character. + Note, we don't use sort_order[] in this macros. + This is correct even for case insensitive collations: + - basic Latin letters are processed outside this macros; + - for other characters sort_order[x] is equal to x. +*/ +#define my_strnxfrm_mb_non_ascii_char(cs, dst, src, se) \ +{ \ + switch (cs->cset->ismbchar(cs, (const char*) src, (const char*) se)) { \ + case 4: \ + *dst++= *src++; \ + /* fall through */ \ + case 3: \ + *dst++= *src++; \ + /* fall through */ \ + case 2: \ + *dst++= *src++; \ + /* fall through */ \ + case 0: \ + *dst++= *src++; /* byte in range 0x80..0xFF which is not MB head */ \ + } \ +} + + +/* + For character sets with two or three byte multi-byte + characters having multibyte weights *equal* to their codes: + cp932, euckr, gb2312, sjis, eucjpms, ujis. +*/ +size_t +my_strnxfrm_mb(const CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *d0= dst; + uchar *de= dst + dstlen; + const uchar *se= src + srclen; + const uchar *sort_order= cs->sort_order; + + DBUG_ASSERT(cs->mbmaxlen <= 4); + + /* + If "srclen" is smaller than both "dstlen" and "nweights" + then we can run a simplified loop - + without checking "nweights" and "de". + */ + if (dstlen >= srclen && nweights >= srclen) + { + if (sort_order) + { + /* Optimized version for a case insensitive collation */ + for (; src < se; nweights--) + { + if (*src < 128) /* quickly catch ASCII characters */ + *dst++= sort_order[*src++]; + else + my_strnxfrm_mb_non_ascii_char(cs, dst, src, se); + } + } + else + { + /* Optimized version for a case sensitive collation (no sort_order) */ + for (; src < se; nweights--) + { + if (*src < 128) /* quickly catch ASCII characters */ + *dst++= *src++; + else + my_strnxfrm_mb_non_ascii_char(cs, dst, src, se); + } + } + goto pad; + } + + /* + A thourough loop, checking all possible limits: + "se", "nweights" and "de". + */ + for (; src < se && nweights && dst < de; nweights--) + { + int chlen; + if (*src < 128 || + !(chlen= cs->cset->ismbchar(cs, (const char*) src, (const char*) se))) + { + /* Single byte character */ + *dst++= sort_order ? sort_order[*src++] : *src++; + } + else + { + /* Multi-byte character */ + size_t len= (dst + chlen <= de) ? chlen : de - dst; + memcpy(dst, src, len); + dst+= len; + src+= len; + } + } + +pad: + return my_strxfrm_pad_desc_and_reverse(cs, d0, dst, de, nweights, flags, 0); +} + + +int +my_strcasecmp_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), + const char *s, const char *t) +{ + return strcmp(s,t); +} + + +void +my_hash_sort_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), + const uchar *key, size_t len,ulong *nr1, ulong *nr2) +{ + const uchar *pos = key; + + /* + Remove trailing spaces. We have to do this to be able to compare + 'A ' and 'A' as identical + */ + key= skip_trailing_space(key, len); + + for (; pos < (uchar*) key ; pos++) + { + nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * + ((uint)*pos)) + (nr1[0] << 8); + nr2[0]+=3; + } +} + + +/* + Fill the given buffer with 'maximum character' for given charset + SYNOPSIS + pad_max_char() + cs Character set + str Start of buffer to fill + end End of buffer to fill + + DESCRIPTION + Write max key: + - for non-Unicode character sets: + just memset using max_sort_char if max_sort_char is one byte. + In case when max_sort_char is two bytes, fill with double-byte pairs + and optionally pad with a single space character. + - for Unicode character set (utf-8): + create a buffer with multibyte representation of the max_sort_char + character, and copy it into max_str in a loop. +*/ +static void pad_max_char(const CHARSET_INFO *cs, char *str, char *end) +{ + char buf[10]; + char buflen; + + if (!(cs->state & MY_CS_UNICODE)) + { + if (cs->max_sort_char <= 255) + { + memset(str, cs->max_sort_char, end - str); + return; + } + else if (cs->max_sort_char <= 0xFFFF) + { + buf[0]= (char)(cs->max_sort_char >> 8); + buf[1]= cs->max_sort_char & 0xFF; + buflen= 2; + } + else + { + /* Currently, it's only for GB18030, so it must be a 4-byte char */ + DBUG_ASSERT(cs->max_sort_char > 0xFFFFFF); + buf[0]= cs->max_sort_char >> 24 & 0xFF; + buf[1]= cs->max_sort_char >> 16 & 0xFF; + buf[2]= cs->max_sort_char >> 8 & 0xFF; + buf[3]= cs->max_sort_char & 0xFF; + buflen= 4; + } + } + else + { + buflen= cs->cset->wc_mb(cs, cs->max_sort_char, (uchar*) buf, + (uchar*) buf + sizeof(buf)); + } + + DBUG_ASSERT(buflen > 0); + do + { + if ((str + buflen) <= end) + { + /* Enough space for the characer */ + memcpy(str, buf, buflen); + str+= buflen; + } + else + { + /* + There is no space for whole multibyte + character, then add trailing spaces. + */ + *str++= ' '; + } + } while (str < end); +} + +/* +** Calculate min_str and max_str that ranges a LIKE string. +** Arguments: +** ptr Pointer to LIKE string. +** ptr_length Length of LIKE string. +** escape Escape character in LIKE. (Normally '\'). +** All escape characters should be removed from min_str and max_str +** res_length Length of min_str and max_str. +** min_str Smallest case sensitive string that ranges LIKE. +** Should be space padded to res_length. +** max_str Largest case sensitive string that ranges LIKE. +** Normally padded with the biggest character sort value. +** +** The function should return 0 if ok and 1 if the LIKE string can't be +** optimized ! +*/ + +my_bool my_like_range_mb(const CHARSET_INFO *cs, + const char *ptr,size_t ptr_length, + pbool escape, pbool w_one, pbool w_many, + size_t res_length, + char *min_str,char *max_str, + size_t *min_length,size_t *max_length) +{ + uint mb_len; + const char *end= ptr + ptr_length; + char *min_org= min_str; + char *min_end= min_str + res_length; + char *max_end= max_str + res_length; + size_t maxcharlen= res_length / cs->mbmaxlen; + const MY_CONTRACTIONS *contractions= my_charset_get_contractions(cs, 0); + + for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--) + { + /* We assume here that escape, w_any, w_namy are one-byte characters */ + if (*ptr == escape && ptr+1 != end) + ptr++; /* Skip escape */ + else if (*ptr == w_one || *ptr == w_many) /* '_' and '%' in SQL */ + { +fill_max_and_min: + /* + Calculate length of keys: + 'a\0\0... is the smallest possible string when we have space expand + a\ff\ff... is the biggest possible string + */ + *min_length= ((cs->state & MY_CS_BINSORT) ? (size_t) (min_str - min_org) : + res_length); + *max_length= res_length; + /* Create min key */ + do + { + *min_str++= (char) cs->min_sort_char; + } while (min_str != min_end); + + /* + Write max key: create a buffer with multibyte + representation of the max_sort_char character, + and copy it into max_str in a loop. + */ + *max_length= res_length; + pad_max_char(cs, max_str, max_end); + return 0; + } + if ((mb_len= my_ismbchar(cs, ptr, end)) > 1) + { + if (ptr+mb_len > end || min_str+mb_len > min_end) + break; + while (mb_len--) + *min_str++= *max_str++= *ptr++; + } + else + { + /* + Special case for collations with contractions. + For example, in Chezh, 'ch' is a separate letter + which is sorted between 'h' and 'i'. + If the pattern 'abc%', 'c' at the end can mean: + - letter 'c' itself, + - beginning of the contraction 'ch'. + + If we simply return this LIKE range: + + 'abc\min\min\min' and 'abc\max\max\max' + + then this query: SELECT * FROM t1 WHERE a LIKE 'abc%' + will only find values starting from 'abc[^h]', + but won't find values starting from 'abch'. + + We must ignore contraction heads followed by w_one or w_many. + ('Contraction head' means any letter which can be the first + letter in a contraction) + + For example, for Czech 'abc%', we will return LIKE range, + which is equal to LIKE range for 'ab%': + + 'ab\min\min\min\min' and 'ab\max\max\max\max'. + + */ + if (contractions && ptr + 1 < end && + my_uca_can_be_contraction_head(contractions, (uchar) *ptr)) + { + /* Ptr[0] is a contraction head. */ + + if (ptr[1] == w_one || ptr[1] == w_many) + { + /* Contraction head followed by a wildcard, quit. */ + goto fill_max_and_min; + } + + /* + Some letters can be both contraction heads and contraction tails. + For example, in Danish 'aa' is a separate single letter which + is sorted after 'z'. So 'a' can be both head and tail. + + If ptr[0]+ptr[1] is a contraction, + then put both letters together. + + If ptr[1] can be a contraction part, but ptr[0]+ptr[1] + is not a contraction, then we put only ptr[0], + and continue with ptr[1] on the next loop. + */ + if (my_uca_can_be_contraction_tail(contractions, (uchar) ptr[1]) && + my_uca_contraction2_weight(contractions, (uchar) ptr[0], ptr[1])) + { + /* Contraction found */ + if (maxcharlen == 1 || min_str + 1 >= min_end) + { + /* Both contraction parts don't fit, quit */ + goto fill_max_and_min; + } + + /* Put contraction head */ + *min_str++= *max_str++= *ptr++; + maxcharlen--; + } + } + /* Put contraction tail, or a single character */ + *min_str++= *max_str++= *ptr++; + } + } + + *min_length= *max_length = (size_t) (min_str - min_org); + while (min_str != min_end) + *min_str++= *max_str++= ' '; /* Because if key compression */ + return 0; +} + + +/** + Calculate min_str and max_str that ranges a LIKE string. + Generic function, currently used for ucs2, utf16, utf32, + but should be suitable for any other character sets with + cs->min_sort_char and cs->max_sort_char represented in + Unicode code points. + + @param cs Character set and collation pointer + @param ptr Pointer to LIKE pattern. + @param ptr_length Length of LIKE pattern. + @param escape Escape character pattern, typically '\'. + @param w_one 'One character' pattern, typically '_'. + @param w_many 'Many characters' pattern, typically '%'. + @param res_length Length of min_str and max_str. + + @param[out] min_str Smallest string that ranges LIKE. + @param[out] max_str Largest string that ranges LIKE. + @param[out] min_len Length of min_str + @param[out] max_len Length of max_str + + @return Optimization status. + @retval FALSE if LIKE pattern can be optimized + @rerval TRUE if LIKE can't be optimized. +*/ +my_bool +my_like_range_generic(const CHARSET_INFO *cs, + const char *ptr, size_t ptr_length, + pbool escape, pbool w_one, pbool w_many, + size_t res_length, + char *min_str,char *max_str, + size_t *min_length,size_t *max_length) +{ + const char *end= ptr + ptr_length; + const char *min_org= min_str; + const char *max_org= max_str; + char *min_end= min_str + res_length; + char *max_end= max_str + res_length; + size_t charlen= res_length / cs->mbmaxlen; + size_t res_length_diff; + const MY_CONTRACTIONS *contractions= my_charset_get_contractions(cs, 0); + + for ( ; charlen > 0; charlen--) + { + my_wc_t wc, wc2; + int res; + if ((res= cs->cset->mb_wc(cs, &wc, (uchar*) ptr, (uchar*) end)) <= 0) + { + if (res == MY_CS_ILSEQ) /* Bad sequence */ + return TRUE; /* min_length and max_length are not important */ + break; /* End of the string */ + } + ptr+= res; + + if (wc == (my_wc_t) escape) + { + if ((res= cs->cset->mb_wc(cs, &wc, (uchar*) ptr, (uchar*) end)) <= 0) + { + if (res == MY_CS_ILSEQ) + return TRUE; /* min_length and max_length are not important */ + /* + End of the string: Escape is the last character. + Put escape as a normal character. + We'll will leave the loop on the next iteration. + */ + } + else + ptr+= res; + + /* Put escape character to min_str and max_str */ + if ((res= cs->cset->wc_mb(cs, wc, + (uchar*) min_str, (uchar*) min_end)) <= 0) + goto pad_set_lengths; /* No space */ + min_str+= res; + + if ((res= cs->cset->wc_mb(cs, wc, + (uchar*) max_str, (uchar*) max_end)) <= 0) + goto pad_set_lengths; /* No space */ + max_str+= res; + continue; + } + else if (wc == (my_wc_t) w_one) + { + if ((res= cs->cset->wc_mb(cs, cs->min_sort_char, + (uchar*) min_str, (uchar*) min_end)) <= 0) + goto pad_set_lengths; + min_str+= res; + + if ((res= cs->cset->wc_mb(cs, cs->max_sort_char, + (uchar*) max_str, (uchar*) max_end)) <= 0) + goto pad_set_lengths; + max_str+= res; + continue; + } + else if (wc == (my_wc_t) w_many) + { + /* + Calculate length of keys: + a\min\min... is the smallest possible string + a\max\max... is the biggest possible string + */ + *min_length= ((cs->state & MY_CS_BINSORT) ? + (size_t) (min_str - min_org) : + res_length); + *max_length= res_length; + goto pad_min_max; + } + + if (contractions && + my_uca_can_be_contraction_head(contractions, wc) && + (res= cs->cset->mb_wc(cs, &wc2, (uchar*) ptr, (uchar*) end)) > 0) + { + uint16 *weight; + if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many)) + { + /* Contraction head followed by a wildcard */ + *min_length= *max_length= res_length; + goto pad_min_max; + } + + if (my_uca_can_be_contraction_tail(contractions, wc2) && + (weight= my_uca_contraction2_weight(contractions, wc, wc2)) && weight[0]) + { + /* Contraction found */ + if (charlen == 1) + { + /* contraction does not fit to result */ + *min_length= *max_length= res_length; + goto pad_min_max; + } + + ptr+= res; + charlen--; + + /* Put contraction head */ + if ((res= cs->cset->wc_mb(cs, wc, + (uchar*) min_str, (uchar*) min_end)) <= 0) + goto pad_set_lengths; + min_str+= res; + + if ((res= cs->cset->wc_mb(cs, wc, + (uchar*) max_str, (uchar*) max_end)) <= 0) + goto pad_set_lengths; + max_str+= res; + wc= wc2; /* Prepare to put contraction tail */ + } + } + + /* Normal character, or contraction tail */ + if ((res= cs->cset->wc_mb(cs, wc, + (uchar*) min_str, (uchar*) min_end)) <= 0) + goto pad_set_lengths; + min_str+= res; + if ((res= cs->cset->wc_mb(cs, wc, + (uchar*) max_str, (uchar*) max_end)) <= 0) + goto pad_set_lengths; + max_str+= res; + } + +pad_set_lengths: + *min_length= (size_t) (min_str - min_org); + *max_length= (size_t) (max_str - max_org); + +pad_min_max: + /* + Fill up max_str and min_str to res_length. + fill() cannot set incomplete characters and + requires that "length" argument is divisible to mbminlen. + Make sure to call fill() with proper "length" argument. + */ + res_length_diff= res_length % cs->mbminlen; + cs->cset->fill(cs, min_str, min_end - min_str - res_length_diff, + cs->min_sort_char); + cs->cset->fill(cs, max_str, max_end - max_str - res_length_diff, + cs->max_sort_char); + + /* In case of incomplete characters set the remainder to 0x00's */ + if (res_length_diff) + { + /* Example: odd res_length for ucs2 */ + memset(min_end - res_length_diff, 0, res_length_diff); + memset(max_end - res_length_diff, 0, res_length_diff); + } + return FALSE; +} + + +static +int +my_wildcmp_mb_bin_impl(const CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many, int recurse_level) +{ + int result= -1; /* Not found, using wildcards */ + + if (my_string_stack_guard && my_string_stack_guard(recurse_level)) + return 1; + while (wildstr != wildend) + { + while (*wildstr != w_many && *wildstr != w_one) + { + int l; + if (*wildstr == escape && wildstr+1 != wildend) + wildstr++; + if ((l = my_ismbchar(cs, wildstr, wildend))) + { + if (str+l > str_end || memcmp(str, wildstr, l) != 0) + return 1; + str += l; + wildstr += l; + } + else + if (str == str_end || *wildstr++ != *str++) + return(1); /* No match */ + if (wildstr == wildend) + return (str != str_end); /* Match if both are at end */ + result=1; /* Found an anchor char */ + } + if (*wildstr == w_one) + { + do + { + if (str == str_end) /* Skip one char if possible */ + return (result); + INC_PTR(cs,str,str_end); + } while (++wildstr < wildend && *wildstr == w_one); + if (wildstr == wildend) + break; + } + if (*wildstr == w_many) + { /* Found w_many */ + int cmp; + const char* mb = wildstr; + int mb_len=0; + + wildstr++; + /* Remove any '%' and '_' from the wild search string */ + for (; wildstr != wildend ; wildstr++) + { + if (*wildstr == w_many) + continue; + if (*wildstr == w_one) + { + if (str == str_end) + return (-1); + INC_PTR(cs,str,str_end); + continue; + } + break; /* Not a wild character */ + } + if (wildstr == wildend) + return(0); /* Ok if w_many is last */ + if (str == str_end) + return -1; + + if ((cmp= *wildstr) == escape && wildstr+1 != wildend) + cmp= *++wildstr; + + mb=wildstr; + mb_len= my_ismbchar(cs, wildstr, wildend); + INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */ + do + { + for (;;) + { + if (str >= str_end) + return -1; + if (mb_len) + { + if (str+mb_len <= str_end && memcmp(str, mb, mb_len) == 0) + { + str += mb_len; + break; + } + } + else if (!my_ismbchar(cs, str, str_end) && *str == cmp) + { + str++; + break; + } + INC_PTR(cs,str, str_end); + } + { + int tmp=my_wildcmp_mb_bin_impl(cs,str,str_end, + wildstr,wildend,escape, + w_one,w_many, recurse_level + 1); + if (tmp <= 0) + return (tmp); + } + } while (str != str_end && wildstr[0] != w_many); + return(-1); + } + } + return (str != str_end ? 1 : 0); +} + +int +my_wildcmp_mb_bin(const CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many) +{ + return my_wildcmp_mb_bin_impl(cs, str, str_end, + wildstr, wildend, + escape, w_one, w_many, 1); +} + + +/* + Data was produced from EastAsianWidth.txt + using utt11-dump utility. +*/ +static const char pg11[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pg23[256]= +{ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pg2E[256]= +{ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pg2F[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0 +}; + +static const char pg30[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, +0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +static const char pg31[256]= +{ +0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +static const char pg32[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0 +}; + +static const char pg4D[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pg9F[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pgA4[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pgD7[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pgFA[256]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pgFE[256]= +{ +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const char pgFF[256]= +{ +0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + +static const struct {int page; const char *p;} utr11_data[256]= +{ +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,pg11},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,pg23},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,pg2E},{0,pg2F}, +{0,pg30},{0,pg31},{0,pg32},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pg4D},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pg9F}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pgA4},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL}, +{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{1,NULL},{0,pgD7}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL},{0,NULL}, +{0,NULL},{1,NULL},{0,pgFA},{0,NULL},{0,NULL},{0,NULL},{0,pgFE},{0,pgFF} +}; + + +size_t my_numcells_mb(const CHARSET_INFO *cs, const char *b, const char *e) +{ + my_wc_t wc; + size_t clen= 0; + + while (b < e) + { + int mb_len; + uint pg; + if ((mb_len= cs->cset->mb_wc(cs, &wc, (uchar*) b, (uchar*) e)) <= 0 || + wc > 0xFFFF) + { + /* + Let's think a wrong sequence takes 1 dysplay cell. + Also, consider supplementary characters as taking one cell. + */ + mb_len= 1; + b++; + continue; + } + b+= mb_len; + if (wc > 0xFFFF) + { + if (wc >= 0x20000 && wc <= 0x3FFFD) /* CJK Ideograph Extension B, C */ + clen+= 1; + } + else + { + pg= (wc >> 8) & 0xFF; + clen+= utr11_data[pg].p ? utr11_data[pg].p[wc & 0xFF] : utr11_data[pg].page; + } + clen++; + } + return clen; +} + + +int my_mb_ctype_mb(const CHARSET_INFO *cs, int *ctype, + const uchar *s, const uchar *e) +{ + my_wc_t wc; + int res= cs->cset->mb_wc(cs, &wc, s, e); + if (res <= 0 || wc > 0xFFFF) + *ctype= 0; + else + *ctype= my_uni_ctype[wc>>8].ctype ? + my_uni_ctype[wc>>8].ctype[wc&0xFF] : + my_uni_ctype[wc>>8].pctype; + return res; +} + + +MY_COLLATION_HANDLER my_collation_mb_bin_handler = +{ + NULL, /* init */ + my_strnncoll_mb_bin, + my_strnncollsp_mb_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; -- cgit v1.1