LCOV - code coverage report
Current view: top level - lib/util/charset - util_str.c (source / functions) Hit Total Coverage
Test: coverage report for abartlet/fix-coverage dd10fb34 Lines: 202 233 86.7 %
Date: 2021-09-23 10:06:22 Functions: 20 20 100.0 %

          Line data    Source code
       1             : /*
       2             :    Unix SMB/CIFS implementation.
       3             :    Samba utility functions
       4             :    Copyright (C) Andrew Tridgell 1992-2001
       5             :    Copyright (C) Simo Sorce 2001
       6             :    Copyright (C) Andrew Bartlett 2011
       7             :    Copyright (C) Jeremy Allison  1992-2007
       8             :    Copyright (C) Martin Pool     2003
       9             :    Copyright (C) James Peach     2006
      10             : 
      11             :    This program is free software; you can redistribute it and/or modify
      12             :    it under the terms of the GNU General Public License as published by
      13             :    the Free Software Foundation; either version 3 of the License, or
      14             :    (at your option) any later version.
      15             : 
      16             :    This program is distributed in the hope that it will be useful,
      17             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             :    GNU General Public License for more details.
      20             : 
      21             :    You should have received a copy of the GNU General Public License
      22             :    along with this program.  If not, see <http://www.gnu.org/licenses/>.
      23             : */
      24             : 
      25             : #include "replace.h"
      26             : #include "system/locale.h"
      27             : #include "charset.h"
      28             : #include "lib/util/fault.h"
      29             : 
      30             : #ifdef strcasecmp
      31             : #undef strcasecmp
      32             : #endif
      33             : #ifdef strncasecmp
      34             : #undef strncasecmp
      35             : #endif
      36             : 
      37             : 
      38             : /**
      39             :  Case insensitive string comparison, handle specified for testing
      40             : **/
      41   370816012 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
      42             :                                  const char *s1, const char *s2)
      43             : {
      44   370816012 :         codepoint_t c1=0, c2=0;
      45   370816012 :         codepoint_t u1=0, u2=0;
      46   370816012 :         codepoint_t l1=0, l2=0;
      47             :         size_t size1, size2;
      48             : 
      49             :         /* handle null ptr comparisons to simplify the use in qsort */
      50   370816012 :         if (s1 == s2) return 0;
      51   370788439 :         if (s1 == NULL) return -1;
      52   370788437 :         if (s2 == NULL) return 1;
      53             : 
      54  1686422209 :         while (*s1 && *s2) {
      55  1332576598 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
      56  1332576598 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
      57             : 
      58  1335941653 :                 if (c1 == INVALID_CODEPOINT ||
      59     3365055 :                     c2 == INVALID_CODEPOINT) {
      60           9 :                         return strcasecmp(s1, s2);
      61             :                 }
      62             : 
      63  1332576589 :                 s1 += size1;
      64  1332576589 :                 s2 += size2;
      65             : 
      66  1332576589 :                 if (c1 == c2) {
      67   974487766 :                         continue;
      68             :                 }
      69             : 
      70   358088823 :                 u1 = toupper_m(c1);
      71   358088823 :                 u2 = toupper_m(c2);
      72   358088823 :                 if (u1 == u2) {
      73      829109 :                         continue;
      74             :                 }
      75             : 
      76   357259714 :                 l1 = tolower_m(c1);
      77   357259714 :                 l2 = tolower_m(c2);
      78   357259714 :                 if (l1 == l2) {
      79           0 :                         continue;
      80             :                 }
      81             : 
      82   357259714 :                 return l1 - l2;
      83             :         }
      84             : 
      85    13528712 :         return *s1 - *s2;
      86             : }
      87             : 
      88             : /**
      89             :  Case insensitive string comparison
      90             : **/
      91   370815994 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
      92             : {
      93   370815994 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
      94   370815994 :         return strcasecmp_m_handle(iconv_handle, s1, s2);
      95             : }
      96             : 
      97             : /**
      98             :  Case insensitive string comparison, length limited, handle specified for testing
      99             : **/
     100     4313331 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
     101             :                                   const char *s1, const char *s2, size_t n)
     102             : {
     103     4313331 :         codepoint_t c1=0, c2=0;
     104     4313331 :         codepoint_t u1=0, u2=0;
     105     4313331 :         codepoint_t l1=0, l2=0;
     106             :         size_t size1, size2;
     107             : 
     108             :         /* handle null ptr comparisons to simplify the use in qsort */
     109     4313331 :         if (s1 == s2) return 0;
     110     4313199 :         if (s1 == NULL) return -1;
     111     4313198 :         if (s2 == NULL) return 1;
     112             : 
     113    19378146 :         while (*s1 && *s2 && n) {
     114    14660804 :                 n--;
     115             : 
     116    14660804 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
     117    14660804 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
     118             : 
     119    14695944 :                 if (c1 == INVALID_CODEPOINT ||
     120       35140 :                     c2 == INVALID_CODEPOINT) {
     121             :                         /*
     122             :                          * n was specified in characters,
     123             :                          * now we must convert it to bytes.
     124             :                          * As bytes are the smallest
     125             :                          * character unit, the following
     126             :                          * increment and strncasecmp is always
     127             :                          * safe.
     128             :                          *
     129             :                          * The source string was already known
     130             :                          * to be n characters long, so we are
     131             :                          * guaranteed to be able to look at the
     132             :                          * (n remaining + size1) bytes from the
     133             :                          * s1 position.
     134             :                          */
     135           1 :                         n += size1;
     136           1 :                         return strncasecmp(s1, s2, n);
     137             :                 }
     138             : 
     139    14660803 :                 s1 += size1;
     140    14660803 :                 s2 += size2;
     141             : 
     142    14660803 :                 if (c1 == c2) {
     143    11171229 :                         continue;
     144             :                 }
     145             : 
     146     3489574 :                 u1 = toupper_m(c1);
     147     3489574 :                 u2 = toupper_m(c2);
     148     3489574 :                 if (u1 == u2) {
     149       23614 :                         continue;
     150             :                 }
     151             : 
     152     3465960 :                 l1 = tolower_m(c1);
     153     3465960 :                 l2 = tolower_m(c2);
     154     3465960 :                 if (l1 == l2) {
     155           0 :                         continue;
     156             :                 }
     157             : 
     158     3465960 :                 return l1 - l2;
     159             :         }
     160             : 
     161      847236 :         if (n == 0) {
     162      839109 :                 return 0;
     163             :         }
     164             : 
     165        4612 :         return *s1 - *s2;
     166             : }
     167             : 
     168             : /**
     169             :  Case insensitive string comparison, length limited
     170             : **/
     171     4313319 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
     172             : {
     173     4313319 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
     174     4313319 :         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
     175             : }
     176             : 
     177             : /**
     178             :  * Compare 2 strings.
     179             :  *
     180             :  * @note The comparison is case-insensitive.
     181             :  **/
     182       73809 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
     183             : {
     184       73809 :         return strcasecmp_m(s1,s2) == 0;
     185             : }
     186             : 
     187             : /**
     188             :  Compare 2 strings (case sensitive).
     189             : **/
     190    12905851 : _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
     191             : {
     192    12905851 :         if (s1 == s2)
     193          48 :                 return true;
     194    12905793 :         if (!s1 || !s2)
     195           0 :                 return false;
     196             : 
     197    12905791 :         return strcmp(s1,s2) == 0;
     198             : }
     199             : 
     200             : /**
     201             :  * Calculate the number of units (8 or 16-bit, depending on the
     202             :  * destination charset), that would be needed to convert the input
     203             :  * string which is expected to be in src_charset encoding to the
     204             :  * destination charset (which should be a unicode charset).
     205             :  */
     206    28770897 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
     207             :                                     const char *s, charset_t src_charset, charset_t dst_charset)
     208             : {
     209    28770897 :         size_t count = 0;
     210             : 
     211             : #ifdef DEVELOPER
     212    28770897 :         switch (dst_charset) {
     213           0 :         case CH_DOS:
     214             :         case CH_UNIX:
     215           0 :                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
     216    28088180 :         default:
     217    28088180 :                 break;
     218             :         }
     219             : 
     220    28770897 :         switch (src_charset) {
     221           0 :         case CH_UTF16LE:
     222             :         case CH_UTF16BE:
     223           0 :                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
     224    28088180 :         default:
     225    28088180 :                 break;
     226             :         }
     227             : #endif
     228    28770897 :         if (!s) {
     229       69279 :                 return 0;
     230             :         }
     231             : 
     232   946949442 :         while (*s && !(((uint8_t)*s) & 0x80)) {
     233   893444516 :                 s++;
     234   893444516 :                 count++;
     235             :         }
     236             : 
     237    28697850 :         if (!*s) {
     238    28016158 :                 return count;
     239             :         }
     240             : 
     241      306093 :         while (*s) {
     242             :                 size_t c_size;
     243      301329 :                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
     244             :                                                           src_charset, &c_size);
     245      301329 :                 s += c_size;
     246             : 
     247      301329 :                 switch (dst_charset) {
     248      293262 :                 case CH_UTF16LE:
     249             :                 case CH_UTF16BE:
     250             :                 case CH_UTF16MUNGED:
     251      293262 :                         if (c < 0x10000) {
     252             :                                 /* Unicode char fits into 16 bits. */
     253      293259 :                                 count += 1;
     254             :                         } else {
     255             :                                 /* Double-width unicode char - 32 bits. */
     256           3 :                                 count += 2;
     257             :                         }
     258      290971 :                         break;
     259        8067 :                 case CH_UTF8:
     260             :                         /*
     261             :                          * this only checks ranges, and does not
     262             :                          * check for invalid codepoints
     263             :                          */
     264        8067 :                         if (c < 0x80) {
     265        6152 :                                 count += 1;
     266        1915 :                         } else if (c < 0x800) {
     267         871 :                                 count += 2;
     268        1044 :                         } else if (c < 0x10000) {
     269        1044 :                                 count += 3;
     270             :                         } else {
     271           0 :                                 count += 4;
     272             :                         }
     273        6822 :                         break;
     274           0 :                 default:
     275             :                         /*
     276             :                          * non-unicode encoding:
     277             :                          * assume that each codepoint fits into
     278             :                          * one unit in the destination encoding.
     279             :                          */
     280           0 :                         count += 1;
     281             :                 }
     282             :         }
     283             : 
     284        2743 :         return count;
     285             : }
     286             : 
     287             : /**
     288             :  * Calculate the number of units (8 or 16-bit, depending on the
     289             :  * destination charset), that would be needed to convert the input
     290             :  * string which is expected to be in src_charset encoding to the
     291             :  * destination charset (which should be a unicode charset).
     292             :  */
     293    28770885 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
     294             : {
     295    28770885 :         struct smb_iconv_handle *ic = get_iconv_handle();
     296    28770885 :         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
     297             : }
     298             : 
     299    16211225 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
     300             :                                   const charset_t dst_charset)
     301             : {
     302    16211225 :         if (!s) {
     303      219118 :                 return 0;
     304             :         }
     305    15990136 :         return strlen_m_ext(s, src_charset, dst_charset) + 1;
     306             : }
     307             : 
     308      855742 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
     309             :                                        const charset_t src_charset,
     310             :                                        const charset_t dst_charset)
     311             : {
     312             :         size_t len;
     313      855742 :         if (!s) {
     314        1272 :                 return 0;
     315             :         }
     316      854469 :         len = strlen_m_ext(s, src_charset, dst_charset);
     317      854469 :         if (len == 0) {
     318      602642 :                 return 0;
     319             :         }
     320             : 
     321      251526 :         return len+1;
     322             : }
     323             : 
     324             : /**
     325             :  * Calculate the number of 16-bit units that would be needed to convert
     326             :  * the input string which is expected to be in CH_UNIX encoding to UTF16.
     327             :  *
     328             :  * This will be the same as the number of bytes in a string for single
     329             :  * byte strings, but will be different for multibyte.
     330             :  */
     331    11926274 : _PUBLIC_ size_t strlen_m(const char *s)
     332             : {
     333    11926274 :         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
     334             : }
     335             : 
     336             : /**
     337             :    Work out the number of 16-bit (UTF16) units needed for a CH_UNIX string,
     338             :    including the NUL terminator. Returns 0 if the string pointer is NULL.
     339             : **/
     340     2102729 : _PUBLIC_ size_t strlen_m_term(const char *s)
     341             : {
     342     2102729 :         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
     343             : }
     344             : 
     345             : /*
     346             :  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
     347             :  * if a string is there, include the terminator.
     348             :  */
     349             : 
     350      855742 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
     351             : {
     352      855742 :         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
     353             : }
     354             : 
     355             : /**
     356             :  Strchr and strrchr_m are a bit complex on general multi-byte strings.
     357             : **/
     358   194900098 : _PUBLIC_ char *strchr_m(const char *src, char c)
     359             : {
     360             :         const char *s;
     361   194900098 :         struct smb_iconv_handle *ic = get_iconv_handle();
     362   194900098 :         if (src == NULL) {
     363           0 :                 return NULL;
     364             :         }
     365             :         /* characters below 0x40 (top two bits clear) are guaranteed to
     366             :            not appear in non-initial position in multi-byte charsets */
     367   194900098 :         if ((c & 0xC0) == 0) {
     368    57935719 :                 return strchr(src, c);
     369             :         }
     370             : 
     371             :         /* this is quite a common operation, so we want it to be
     372             :            fast. We optimise for the ascii case, knowing that all our
     373             :            supported multi-byte character sets are ascii-compatible
     374             :            (ie. they match for the first 128 chars) */
     375             : 
     376   948894057 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     377   812944683 :                 if (*s == c)
     378       74609 :                         return discard_const_p(char, s);
     379             :         }
     380             : 
     381   136887793 :         if (!*s)
     382   135951351 :                 return NULL;
     383             : 
     384             : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
     385             :         /* With compose characters we must restart from the beginning. JRA. */
     386             :         s = src;
     387             : #endif
     388             : 
     389           4 :         while (*s) {
     390             :                 size_t size;
     391           3 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     392           3 :                 if (c2 == c) {
     393           0 :                         return discard_const_p(char, s);
     394             :                 }
     395           3 :                 s += size;
     396             :         }
     397             : 
     398           0 :         return NULL;
     399             : }
     400             : 
     401             : /**
     402             :  * Multibyte-character version of strrchr
     403             :  */
     404     5905733 : _PUBLIC_ char *strrchr_m(const char *s, char c)
     405             : {
     406             :         struct smb_iconv_handle *ic;
     407     5905733 :         char *ret = NULL;
     408             : 
     409     5905733 :         if (s == NULL) {
     410           0 :                 return NULL;
     411             :         }
     412             : 
     413             :         /* characters below 0x40 (top two bits clear) are guaranteed to
     414             :            not appear in non-initial position in multi-byte charsets */
     415     5905733 :         if ((c & 0xC0) == 0) {
     416     5885759 :                 return strrchr(s, c);
     417             :         }
     418             : 
     419             :         /* this is quite a common operation, so we want it to be
     420             :            fast. We optimise for the ascii case, knowing that all our
     421             :            supported multi-byte character sets are ascii-compatible
     422             :            (ie. they match for the first 128 chars). Also, in Samba
     423             :            we only search for ascii characters in 'c' and that
     424             :            in all mb character sets with a compound character
     425             :            containing c, if 'c' is not a match at position
     426             :            p, then p[-1] > 0x7f. JRA. */
     427             : 
     428             :         {
     429       19974 :                 size_t len = strlen(s);
     430       19974 :                 const char *cp = s;
     431       19974 :                 bool got_mb = false;
     432             : 
     433       19974 :                 if (len == 0)
     434          86 :                         return NULL;
     435       19888 :                 cp += (len - 1);
     436             :                 do {
     437      147157 :                         if (c == *cp) {
     438             :                                 /* Could be a match. Part of a multibyte ? */
     439       32627 :                                 if ((cp > s) &&
     440       15755 :                                         (((unsigned char)cp[-1]) & 0x80)) {
     441             :                                         /* Yep - go slow :-( */
     442           0 :                                         got_mb = true;
     443           0 :                                         break;
     444             :                                 }
     445             :                                 /* No - we have a match ! */
     446       17150 :                                 return discard_const_p(char , cp);
     447             :                         }
     448      129763 :                 } while (cp-- != s);
     449        2434 :                 if (!got_mb)
     450        2434 :                         return NULL;
     451             :         }
     452             : 
     453           0 :         ic = get_iconv_handle();
     454             : 
     455           0 :         while (*s) {
     456             :                 size_t size;
     457           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     458           0 :                 if (c2 == c) {
     459           0 :                         ret = discard_const_p(char, s);
     460             :                 }
     461           0 :                 s += size;
     462             :         }
     463             : 
     464           0 :         return ret;
     465             : }
     466             : 
     467             : /**
     468             :   return True if any (multi-byte) character is lower case
     469             : */
     470          35 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
     471             :                                  const char *string)
     472             : {
     473         998 :         while (*string) {
     474             :                 size_t c_size;
     475             :                 codepoint_t s;
     476             :                 codepoint_t t;
     477             : 
     478         950 :                 s = next_codepoint_handle(ic, string, &c_size);
     479         950 :                 string += c_size;
     480             : 
     481         950 :                 t = toupper_m(s);
     482             : 
     483         950 :                 if (s != t) {
     484          22 :                         return true; /* that means it has lower case chars */
     485             :                 }
     486             :         }
     487             : 
     488           0 :         return false;
     489             : }
     490             : 
     491          17 : _PUBLIC_ bool strhaslower(const char *string)
     492             : {
     493          17 :         struct smb_iconv_handle *ic = get_iconv_handle();
     494          17 :         return strhaslower_handle(ic, string);
     495             : }
     496             : 
     497             : /**
     498             :   return True if any (multi-byte) character is upper case
     499             : */
     500          35 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
     501             :                                  const char *string)
     502             : {
     503         989 :         while (*string) {
     504             :                 size_t c_size;
     505             :                 codepoint_t s;
     506             :                 codepoint_t t;
     507             : 
     508         941 :                 s = next_codepoint_handle(ic, string, &c_size);
     509         941 :                 string += c_size;
     510             : 
     511         941 :                 t = tolower_m(s);
     512             : 
     513         941 :                 if (s != t) {
     514          22 :                         return true; /* that means it has upper case chars */
     515             :                 }
     516             :         }
     517             : 
     518           0 :         return false;
     519             : }
     520             : 
     521          17 : _PUBLIC_ bool strhasupper(const char *string)
     522             : {
     523          17 :         struct smb_iconv_handle *ic = get_iconv_handle();
     524          17 :         return strhasupper_handle(ic, string);
     525             : }
     526             : 
     527             : /***********************************************************************
     528             :  strstr_m - We convert via ucs2 for now.
     529             : ***********************************************************************/
     530             : 
     531     1369503 : char *strstr_m(const char *src, const char *findstr)
     532             : {
     533     1369503 :         TALLOC_CTX *mem_ctx = NULL;
     534             :         smb_ucs2_t *p;
     535             :         smb_ucs2_t *src_w, *find_w;
     536             :         const char *s;
     537             :         char *s2;
     538     1369503 :         char *retp = NULL;
     539     1369503 :         size_t converted_size, findstr_len = 0;
     540             : 
     541             :         /* for correctness */
     542     1369503 :         if (!findstr[0]) {
     543           0 :                 return discard_const_p(char, src);
     544             :         }
     545             : 
     546             :         /* Samba does single character findstr calls a *lot*. */
     547     1369501 :         if (findstr[1] == '\0')
     548       91073 :                 return strchr_m(src, *findstr);
     549             : 
     550             :         /* We optimise for the ascii case, knowing that all our
     551             :            supported multi-byte character sets are ascii-compatible
     552             :            (ie. they match for the first 128 chars) */
     553             : 
     554    34105809 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     555    33224757 :                 if (*s == *findstr) {
     556     1872210 :                         if (!findstr_len)
     557      767928 :                                 findstr_len = strlen(findstr);
     558             : 
     559     1872210 :                         if (strncmp(s, findstr, findstr_len) == 0) {
     560      387929 :                                 return discard_const_p(char, s);
     561             :                         }
     562             :                 }
     563             :         }
     564             : 
     565      887390 :         if (!*s)
     566      884161 :                 return NULL;
     567             : 
     568             : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
     569             :         /* 'make check' fails unless we do this */
     570             : 
     571             :         /* With compose characters we must restart from the beginning. JRA. */
     572           9 :         s = src;
     573             : #endif
     574             : 
     575             :         /*
     576             :          * Use get_iconv_handle() just as a non-NULL talloc ctx. In
     577             :          * case we leak memory, this should then be more obvious in
     578             :          * the talloc report.
     579             :          */
     580           9 :         mem_ctx = talloc_new(get_iconv_handle());
     581           9 :         if (mem_ctx == NULL) {
     582           0 :                 return NULL;
     583             :         }
     584             : 
     585           9 :         if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
     586           0 :                 goto done;
     587             :         }
     588             : 
     589           9 :         if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
     590           0 :                 goto done;
     591             :         }
     592             : 
     593           6 :         p = strstr_w(src_w, find_w);
     594             : 
     595           6 :         if (!p) {
     596           0 :                 goto done;
     597             :         }
     598             : 
     599           3 :         *p = 0;
     600           3 :         if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
     601           0 :                 goto done;
     602             :         }
     603           3 :         retp = discard_const_p(char, (s+strlen(s2)));
     604          15 : done:
     605           9 :         TALLOC_FREE(mem_ctx);
     606           9 :         return retp;
     607             : }

Generated by: LCOV version 1.13