LCOV - code coverage report
Current view: top level - source4/heimdal/lib/wind - utf8.c (source / functions) Hit Total Coverage
Test: coverage report for abartlet/fix-coverage dd10fb34 Lines: 97 197 49.2 %
Date: 2021-09-23 10:06:22 Functions: 9 11 81.8 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
       3             :  * (Royal Institute of Technology, Stockholm, Sweden).
       4             :  * All rights reserved.
       5             :  *
       6             :  * Redistribution and use in source and binary forms, with or without
       7             :  * modification, are permitted provided that the following conditions
       8             :  * are met:
       9             :  *
      10             :  * 1. Redistributions of source code must retain the above copyright
      11             :  *    notice, this list of conditions and the following disclaimer.
      12             :  *
      13             :  * 2. Redistributions in binary form must reproduce the above copyright
      14             :  *    notice, this list of conditions and the following disclaimer in the
      15             :  *    documentation and/or other materials provided with the distribution.
      16             :  *
      17             :  * 3. Neither the name of the Institute nor the names of its contributors
      18             :  *    may be used to endorse or promote products derived from this software
      19             :  *    without specific prior written permission.
      20             :  *
      21             :  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
      22             :  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      23             :  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      24             :  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
      25             :  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
      26             :  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
      27             :  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
      28             :  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
      29             :  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
      30             :  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
      31             :  * SUCH DAMAGE.
      32             :  */
      33             : 
      34             : #include <config.h>
      35             : #include "windlocl.h"
      36             : 
       37             : static int  /* Decode the UTF-8 sequence starting at *pp into *out; returns 0 or WIND_ERR_INVALID_UTF8. */
       38     3120792 : utf8toutf32(const unsigned char **pp, uint32_t *out)
       39             : {  /* On failure neither *pp nor *out is written. */
       40     3120792 :     const unsigned char *p = *pp;
       41     3120792 :     unsigned c = *p;  /* lead byte */
       42             : 
       43     3120792 :     if (c & 0x80) {  /* high bit set: multi-byte sequence */
       44      928254 :         if ((c & 0xE0) == 0xC0) {  /* 110xxxxx: two-byte sequence */
       45       28686 :             const unsigned c2 = *++p;
       46       28686 :             if ((c2 & 0xC0) == 0x80) {  /* continuation bytes must look like 10xxxxxx */
       47       57372 :                 *out =  ((c  & 0x1F) << 6)
       48       28686 :                     | (c2 & 0x3F);  /* NOTE(review): overlong forms are accepted, e.g. C0 80 decodes to 0 */
       49             :             } else {
       50           0 :                 return WIND_ERR_INVALID_UTF8;  /* also taken when the string ends here: '\0' is not a continuation byte */
       51             :             }
       52      899568 :         } else if ((c & 0xF0) == 0xE0) {  /* 1110xxxx: three-byte sequence */
       53      899568 :             const unsigned c2 = *++p;
       54      899568 :             if ((c2 & 0xC0) == 0x80) {
       55      899568 :                 const unsigned c3 = *++p;
       56      899568 :                 if ((c3 & 0xC0) == 0x80) {
       57     1799136 :                     *out =   ((c  & 0x0F) << 12)
       58      899568 :                         | ((c2 & 0x3F) << 6)
       59      899568 :                         |  (c3 & 0x3F);  /* NOTE(review): no check excludes the surrogate range 0xD800-0xDFFF */
       60             :                 } else {
       61           0 :                     return WIND_ERR_INVALID_UTF8;
       62             :                 }
       63             :             } else {
       64           0 :                 return WIND_ERR_INVALID_UTF8;
       65             :             }
       66           0 :         } else if ((c & 0xF8) == 0xF0) {  /* 11110xxx: four-byte sequence */
       67           0 :             const unsigned c2 = *++p;
       68           0 :             if ((c2 & 0xC0) == 0x80) {
       69           0 :                 const unsigned c3 = *++p;
       70           0 :                 if ((c3 & 0xC0) == 0x80) {
       71           0 :                     const unsigned c4 = *++p;
       72           0 :                     if ((c4 & 0xC0) == 0x80) {
       73           0 :                         *out =   ((c  & 0x07) << 18)
       74           0 :                             | ((c2 & 0x3F) << 12)
       75           0 :                             | ((c3 & 0x3F) <<  6)
       76           0 :                             |  (c4 & 0x3F);  /* NOTE(review): result is not range-checked; values above 0x10FFFF can be produced */
       77             :                     } else {
       78           0 :                         return WIND_ERR_INVALID_UTF8;
       79             :                     }
       80             :                 } else {
       81           0 :                     return WIND_ERR_INVALID_UTF8;
       82             :                 }
       83             :             } else {
       84           0 :                 return WIND_ERR_INVALID_UTF8;
       85             :             }
       86             :         } else {
       87           0 :             return WIND_ERR_INVALID_UTF8;  /* stray continuation byte (10xxxxxx) or lead byte 0xF8 and above */
       88             :         }
       89             :     } else {
       90     2192538 :         *out = c;  /* single byte, 0x00-0x7F */
       91             :     }
       92             : 
       93     3120792 :     *pp = p;  /* p is the LAST byte of the sequence, not one past it; the callers in this file advance with ++p */
       94             : 
       95     3120792 :     return 0;
       96             : }
      97             : 
      98             : /**
       99             :  * Convert a UTF-8 string to a UCS4 string.
      100             :  *
      101             :  * @param in a UTF-8 string to convert.
      102             :  * @param out the resulting UCS4 string, must be at least
      103             :  * wind_utf8ucs4_length() long.  If out is NULL, the function will
      104             :  * calculate the needed space for the out variable (just like
      105             :  * wind_utf8ucs4_length()).
      106             :  * @param out_len before processing out_len should be the length of
      107             :  * the out variable, after processing it will be the length of the out
      108             :  * string.
      109             :  *
      110             :  * @return returns 0 on success, a wind error code otherwise
     111             :  * @ingroup wind
     112             :  */
     113             : 
      114             : int
      115        3600 : wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
      116             : {
      117             :     const unsigned char *p;
      118        3600 :     size_t o = 0;  /* code points decoded so far */
      119             :     int ret;
      120             : 
      121      115200 :     for (p = (const unsigned char *)in; *p != '\0'; ++p) {  /* ++p steps past the last byte utf8toutf32() consumed */
      122             :         uint32_t u;
      123             : 
      124       54000 :         ret = utf8toutf32(&p, &u);
      125       54000 :         if (ret)
      126           0 :             return ret;  /* *out_len is left unchanged on error */
      127             : 
      128       54000 :         if (out) {  /* out == NULL: count only, *out_len is not read */
      129       27000 :             if (o >= *out_len)
      130           0 :                 return WIND_ERR_OVERRUN;
      131       27000 :             out[o] = u;
      132             :         }
      133       54000 :         o++;
      134             :     }
      135        3600 :     *out_len = o;  /* number of code points; no terminator is written to out */
      136        3600 :     return 0;
      137             : }
     138             : 
     139             : /**
      140             :  * Calculate the length resulting from converting a UTF-8 string to a
      141             :  * UCS4 string.
      142             :  *
      143             :  * @param in a UTF-8 string to convert.
      144             :  * @param out_len the length of the resulting UCS4 string.
      145             :  *
      146             :  * @return returns 0 on success, a wind error code otherwise
     147             :  * @ingroup wind
     148             :  */
     149             : 
      150             : int
      151        1800 : wind_utf8ucs4_length(const char *in, size_t *out_len)
      152             : {
      153        1800 :     return wind_utf8ucs4(in, NULL, out_len);  /* NULL out selects count-only mode; the input is still decoded and validated */
      154             : }
     155             : 
      156             : static const char first_char[4] =  /* lead-byte prefix of a UTF-8 sequence, indexed by (sequence length - 1) */
      157             :     { 0x00, 0xC0, 0xE0, 0xF0 };  /* NOTE(review): 0xC0..0xF0 exceed CHAR_MAX where plain char is signed - confirm this is intended */
     158             : 
     159             : /**
      160             :  * Convert a UCS4 string to a UTF-8 string.
      161             :  *
      162             :  * @param in a UCS4 string to convert.
      163             :  * @param in_len the length of the input array.
      164             :  *
      165             :  * @param out the resulting UTF-8 string, must be at least
      166             :  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
      167             :  * out is NULL, the function will calculate the needed space for the
      168             :  * out variable (just like wind_ucs4utf8_length()).
      169             :  *
      170             :  * @param out_len before processing out_len should be the length of
      171             :  * the out variable, after processing it will be the length of the out
      172             :  * string.
      173             :  *
      174             :  * @return returns 0 on success, a wind error code otherwise
     175             :  * @ingroup wind
     176             :  */
     177             : 
      178             : int
      179           0 : wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
      180             : {
      181             :     uint32_t ch;
      182             :     size_t i, len, o;  /* len: bytes needed for the current code point; o: total bytes so far */
      183             : 
      184           0 :     for (o = 0, i = 0; i < in_len; i++) {
      185           0 :         ch = in[i];
      186             : 
      187           0 :         if (ch < 0x80) {
      188           0 :             len = 1;
      189           0 :         } else if (ch < 0x800) {
      190           0 :             len = 2;
      191           0 :         } else if (ch < 0x10000) {
      192           0 :             len = 3;
      193           0 :         } else if (ch <= 0x10FFFF) {
      194           0 :             len = 4;
      195             :         } else
      196           0 :             return WIND_ERR_INVALID_UTF32;  /* value above 0x10FFFF */
      197             : 
      198           0 :         o += len;
      199             : 
      200           0 :         if (out) {
      201           0 :             if (o >= *out_len)  /* >= rather than >: at least one byte stays free, which the NUL written after the loop needs */
      202           0 :                 return WIND_ERR_OVERRUN;
      203             : 
      204           0 :             switch(len) {  /* cases fall through: trailing bytes are filled last-to-first, then the lead byte */
      205           0 :             case 4:
      206           0 :                 out[3] = (ch | 0x80) & 0xbf;  /* low six bits as a 10xxxxxx continuation byte */
      207           0 :                 ch = ch >> 6;
      208           0 :             case 3:
      209           0 :                 out[2] = (ch | 0x80) & 0xbf;
      210           0 :                 ch = ch >> 6;
      211           0 :             case 2:
      212           0 :                 out[1] = (ch | 0x80) & 0xbf;
      213           0 :                 ch = ch >> 6;
      214           0 :             case 1:
      215           0 :                 out[0] = ch | first_char[len - 1];  /* remaining high bits plus the length prefix */
      216             :             }
      217             :         }
      218           0 :         out += len;  /* NOTE(review): runs even when out is NULL (wind_ucs2utf8 advances inside the if), so the `if (out)` below tests the advanced pointer */
      219             :     }
      220           0 :     if (out) {
      221           0 :         if (o + 1 >= *out_len)  /* NOTE(review): stricter than the `o >= *out_len` used in wind_ucs2utf8 - confirm which is intended */
      222           0 :             return WIND_ERR_OVERRUN;
      223           0 :         *out = '\0';
      224             :     }
      225           0 :     *out_len = o;  /* byte count, excluding the NUL */
      226           0 :     return 0;
      227             : }
     228             : 
     229             : /**
      230             :  * Calculate the length resulting from converting a UCS4 string to a UTF-8 string.
      231             :  *
      232             :  * @param in a UCS4 string to convert.
      233             :  * @param in_len the length of UCS4 string to convert.
      234             :  * @param out_len the length of the resulting UTF-8 string.
      235             :  *
      236             :  * @return returns 0 on success, a wind error code otherwise
     237             :  * @ingroup wind
     238             :  */
     239             : 
      240             : int
      241           0 : wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
      242             : {
      243           0 :     return wind_ucs4utf8(in, in_len, NULL, out_len);  /* NULL out requests the length only. NOTE(review): wind_ucs4utf8 does `out += len` unconditionally, so this path does arithmetic on NULL - verify */
      244             : }
     245             : 
     246             : /**
      247             :  * Read a UCS2 string from a buffer.
      248             :  *
      249             :  * @param ptr The input buffer to read from.
      250             :  * @param len the length of the input buffer, in bytes.
      251             :  * @param flags Flags to control the behavior of the function.
      252             :  * @param out the output UCS2 array, must be at least len/2 elements long.
      253             :  * @param out_len the capacity of out on entry, the number of elements written on return
      254             :  *
      255             :  * @return returns 0 on success, a wind error code otherwise.
     256             :  * @ingroup wind
     257             :  */
     258             : 
      259             : int
      260       86081 : wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
      261             :               uint16_t *out, size_t *out_len)
      262             : {
      263       86081 :     const unsigned char *p = ptr;
      264       86081 :     int little = ((*flags) & WIND_RW_LE);  /* non-zero when the caller's flags select little-endian */
      265       86081 :     size_t olen = *out_len;  /* free elements remaining in out */
      266             : 
      267             :     /** if len is zero, flags are unchanged */
      268       86081 :     if (len == 0) {
      269           0 :         *out_len = 0;
      270           0 :         return 0;
      271             :     }
      272             : 
      273             :     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
      274       86081 :     if (len & 1)
      275           0 :         return WIND_ERR_LENGTH_NOT_MOD2;
      276             : 
      277             :     /**
      278             :      * If the flag WIND_RW_BOM is set, check for a BOM. If no BOM is
      279             :      * found, check whether an LE/BE flag is already set and use that,
      280             :      * otherwise fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM
      281             :      * and the LE/BE flags and set the resulting LE/BE flag.
      282             :      */
      283       86081 :     if ((*flags) & WIND_RW_BOM) {
      284           0 :         uint16_t bom = (p[0] << 8) + p[1];  /* first two bytes, combined big-endian */
      285           0 :         if (bom == 0xfffe || bom == 0xfeff) {
      286           0 :             little = (bom == 0xfffe);  /* bytes FF FE select little-endian, FE FF big-endian */
      287           0 :             p += 2;  /* the BOM itself is not copied to out */
      288           0 :             len -= 2;
      289           0 :         } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
      290             :             /* little already set */
      291             :         } else
      292           0 :             return WIND_ERR_NO_BOM;
      293           0 :         *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
      294           0 :         *flags |= little ? WIND_RW_LE : WIND_RW_BE;
      295             :     }
      296             : 
      297     1277760 :     while (len) {  /* two input bytes become one output element per iteration */
      298     1107425 :         if (olen < 1)
      299           0 :             return WIND_ERR_OVERRUN;  /* *out_len is not updated on this path */
      300     1107425 :         if (little)
      301     1107425 :             *out = (p[1] << 8) + p[0];
      302             :         else
      303           0 :             *out = (p[0] << 8) + p[1];  /* big-endian is used whenever little is zero */
      304     1107425 :         out++; p += 2; len -= 2; olen--;
      305             :     }
      306       86081 :     *out_len -= olen;  /* capacity minus remaining space = elements written */
      307       86081 :     return 0;
      308             : }
     309             : 
     310             : /**
     311             :  * Write an UCS2 string to a buffer.
     312             :  *
      313             :  * @param in The input UCS2 string.
      314             :  * @param in_len the length of the input string, in 16-bit units.
      315             :  * @param flags Flags to control the behavior of the function.
      316             :  * @param ptr The output buffer to write to, the array must be at least
      317             :  * (in_len + 1) * 2 bytes long.
      318             :  * @param out_len the size of ptr in bytes on entry, the number of bytes used on return
      319             :  *
      320             :  * @return returns 0 on success, a wind error code otherwise.
     321             :  * @ingroup wind
     322             :  */
     323             : 
      324             : int
      325       66861 : wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
      326             :                void *ptr, size_t *out_len)
      327             : {  /* *flags is only read in this function, never modified */
      328       66861 :     unsigned char *p = ptr;
      329       66861 :     size_t len = *out_len;  /* bytes still free in the output buffer */
      330             : 
      331             :     /** If the output buffer length (*out_len) is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
      332       66861 :     if (len & 1)
      333           0 :         return WIND_ERR_LENGTH_NOT_MOD2;
      334             : 
      335             :     /** On zero input length, flags are preserved */
      336       66861 :     if (in_len == 0) {
      337           0 :         *out_len = 0;
      338           0 :         return 0;
      339             :     }
      340             :     /** If flags have WIND_RW_BOM set, the byte order mark is written
      341             :      * first to the output data */
      342       66861 :     if ((*flags) & WIND_RW_BOM) {
      343           0 :         uint16_t bom = 0xfffe;  /* NOTE(review): with WIND_RW_LE this emits bytes FE FF, which wind_ucs2read decodes as a big-endian BOM - confirm */
      344             : 
      345           0 :         if (len < 2)
      346           0 :             return WIND_ERR_OVERRUN;
      347             : 
      348           0 :         if ((*flags) & WIND_RW_LE) {
      349           0 :             p[0] = (bom     ) & 0xff;
      350           0 :             p[1] = (bom >> 8) & 0xff;
      351             :         } else {
      352           0 :             p[1] = (bom     ) & 0xff;
      353           0 :             p[0] = (bom >> 8) & 0xff;
      354             :         }
      355           0 :         len -= 2;  /* NOTE(review): p is not advanced here, so the first code unit below is stored over the BOM bytes */
      356             :     }
      357             : 
      358     1001735 :     while (in_len) {  /* one 16-bit unit becomes two output bytes per iteration */
      359             :         /** If the output won't fit into out_len, WIND_ERR_OVERRUN is returned */
      360      869692 :         if (len < 2)
      361           0 :             return WIND_ERR_OVERRUN;  /* *out_len is not updated on this path */
      362      869692 :         if ((*flags) & WIND_RW_LE) {
      363      869692 :             p[0] = (in[0]     ) & 0xff;  /* little-endian: low byte first */
      364      869692 :             p[1] = (in[0] >> 8) & 0xff;
      365             :         } else {
      366           0 :             p[1] = (in[0]     ) & 0xff;  /* otherwise big-endian: high byte first */
      367           0 :             p[0] = (in[0] >> 8) & 0xff;
      368             :         }
      369      869692 :         len -= 2;
      370      869692 :         in_len--;
      371      869692 :         p += 2;
      372      869692 :         in++;
      373             :     }
      374       66861 :     *out_len -= len;  /* bytes accounted as used, including the two subtracted for a BOM */
      375       66861 :     return 0;
      376             : }
     377             : 
     378             : 
     379             : /**
      380             :  * Convert a UTF-8 string to a UCS2 string.
      381             :  *
      382             :  * @param in a UTF-8 string to convert.
      383             :  * @param out the resulting UCS2 string, must be at least
      384             :  * wind_utf8ucs2_length() long.  If out is NULL, the function will
      385             :  * calculate the needed space for the out variable (just like
      386             :  * wind_utf8ucs2_length()).
      387             :  * @param out_len before processing out_len should be the length of
      388             :  * the out variable, after processing it will be the length of the out
      389             :  * string.
      390             :  *
      391             :  * @return returns 0 on success, a wind error code otherwise
     392             :  * @ingroup wind
     393             :  */
     394             : 
      395             : int
      396      153052 : wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
      397             : {
      398             :     const unsigned char *p;
      399      153052 :     size_t o = 0;  /* 16-bit units produced so far */
      400             :     int ret;
      401             : 
      402     6439688 :     for (p = (const unsigned char *)in; *p != '\0'; ++p) {  /* ++p steps past the last byte utf8toutf32() consumed */
      403             :         uint32_t u;
      404             : 
      405     3066792 :         ret = utf8toutf32(&p, &u);
      406     3066792 :         if (ret)
      407           0 :             return ret;
      408             : 
      409     3066792 :         if (u & 0xffff0000)  /* code point needs more than 16 bits; it is rejected, not split into a surrogate pair */
      410           0 :             return WIND_ERR_NOT_UTF16;
      411             : 
      412     3066792 :         if (out) {  /* out == NULL: count only, *out_len is not read */
      413     1533396 :             if (o >= *out_len)
      414           0 :                 return WIND_ERR_OVERRUN;
      415     1533396 :             out[o] = u;
      416             :         }
      417     3066792 :         o++;
      418             :     }
      419      153052 :     *out_len = o;  /* number of units; no terminator is written to out */
      420      153052 :     return 0;
      421             : }
     422             : 
     423             : /**
      424             :  * Calculate the length resulting from converting a UTF-8 string to a
      425             :  * UCS2 string.
      426             :  *
      427             :  * @param in a UTF-8 string to convert.
      428             :  * @param out_len the length of the resulting UCS2 string.
      429             :  *
      430             :  * @return returns 0 on success, a wind error code otherwise
     431             :  * @ingroup wind
     432             :  */
     433             : 
      434             : int
      435       76526 : wind_utf8ucs2_length(const char *in, size_t *out_len)
      436             : {
      437       76526 :     return wind_utf8ucs2(in, NULL, out_len);  /* NULL out selects count-only mode; the input is still decoded and validated */
      438             : }
     439             : 
     440             : /**
      441             :  * Convert a UCS2 string to a UTF-8 string.
      442             :  *
      443             :  * @param in a UCS2 string to convert.
      444             :  * @param in_len the length of the in UCS2 string.
      445             :  * @param out the resulting UTF-8 string, must be at least
      446             :  * wind_ucs2utf8_length() + 1 long (the extra char for the NUL).  If
      447             :  * out is NULL, the function will calculate the needed space for the
      448             :  * out variable (just like wind_ucs2utf8_length()).
      449             :  * @param out_len before processing out_len should be the length of
      450             :  * the out variable, after processing it will be the length of the out
      451             :  * string.
      452             :  *
      453             :  * @return returns 0 on success, a wind error code otherwise
     454             :  * @ingroup wind
     455             :  */
     456             : 
      457             : int
      458      172162 : wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
      459             : {
      460             :     uint16_t ch;
      461             :     size_t i, len, o;  /* len: bytes needed for the current unit; o: total bytes so far */
      462             : 
      463     2387012 :     for (o = 0, i = 0; i < in_len; i++) {
      464     2214850 :         ch = in[i];
      465             : 
      466     2214850 :         if (ch < 0x80) {
      467     2170140 :             len = 1;
      468           0 :         } else if (ch < 0x800) {
      469           0 :             len = 2;
      470             :         } else
      471           0 :             len = 3;  /* every unit >= 0x800 takes three bytes; units are encoded one at a time, surrogate pairs are not combined */
      472             : 
      473     2214850 :         o += len;
      474             : 
      475     2214850 :         if (out) {  /* out == NULL: only the length is accumulated */
      476     1107425 :             if (o >= *out_len)  /* >= rather than >: at least one byte stays free, which the NUL written after the loop needs */
      477           0 :                 return WIND_ERR_OVERRUN;
      478             : 
      479     1107425 :             switch(len) {  /* cases fall through: trailing bytes are filled last-to-first, then the lead byte */
      480           0 :             case 3:
      481           0 :                 out[2] = (ch | 0x80) & 0xbf;  /* low six bits as a 10xxxxxx continuation byte */
      482           0 :                 ch = ch >> 6;
      483           0 :             case 2:
      484           0 :                 out[1] = (ch | 0x80) & 0xbf;
      485           0 :                 ch = ch >> 6;
      486     1107425 :             case 1:
      487     1107425 :                 out[0] = ch | first_char[len - 1];  /* remaining high bits plus the length prefix */
      488             :             }
      489     1107425 :             out += len;
      490             :         }
      491             :     }
      492      172162 :     if (out) {
      493       86081 :         if (o >= *out_len)
      494           0 :             return WIND_ERR_OVERRUN;
      495       86081 :         *out = '\0';
      496             :     }
      497      172162 :     *out_len = o;  /* byte count, excluding the NUL */
      498      172162 :     return 0;
      499             : }
     500             : 
     501             : /**
      502             :  * Calculate the length resulting from converting a UCS2 string to a UTF-8 string.
      503             :  *
      504             :  * @param in a UCS2 string to convert.
      505             :  * @param in_len the length of the UCS2 string to convert.
      506             :  * @param out_len the length of the resulting UTF-8 string.
      507             :  *
      508             :  * @return returns 0 on success, a wind error code otherwise
     509             :  * @ingroup wind
     510             :  */
     511             : 
      512             : int
      513       86081 : wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
      514             : {
      515       86081 :     return wind_ucs2utf8(in, in_len, NULL, out_len);  /* NULL out requests the length only; the result excludes the terminating NUL */
      516             : }

Generated by: LCOV version 1.13