LCOV - code coverage report
Current view: top level - source4/heimdal/lib/wind - normalize.c (source / functions) Hit Total Coverage
Test: coverage report for abartlet/fix-coverage dd10fb34 Lines: 82 155 52.9 %
Date: 2021-09-23 10:06:22 Functions: 8 10 80.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2004 Kungliga Tekniska Högskolan
       3             :  * (Royal Institute of Technology, Stockholm, Sweden).
       4             :  * All rights reserved.
       5             :  *
       6             :  * Redistribution and use in source and binary forms, with or without
       7             :  * modification, are permitted provided that the following conditions
       8             :  * are met:
       9             :  *
      10             :  * 1. Redistributions of source code must retain the above copyright
      11             :  *    notice, this list of conditions and the following disclaimer.
      12             :  *
      13             :  * 2. Redistributions in binary form must reproduce the above copyright
      14             :  *    notice, this list of conditions and the following disclaimer in the
      15             :  *    documentation and/or other materials provided with the distribution.
      16             :  *
      17             :  * 3. Neither the name of the Institute nor the names of its contributors
      18             :  *    may be used to endorse or promote products derived from this software
      19             :  *    without specific prior written permission.
      20             :  *
      21             :  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
      22             :  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      23             :  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      24             :  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
      25             :  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
      26             :  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
      27             :  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
      28             :  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
      29             :  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
      30             :  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
      31             :  * SUCH DAMAGE.
      32             :  */
      33             : 
      34             : #ifdef HAVE_CONFIG_H
      35             : #include <config.h>
      36             : #endif
      37             : #include "windlocl.h"
      38             : 
      39             : #include <assert.h>
      40             : #include <stdlib.h>
      41             : #include <errno.h>
      42             : #include <stdio.h>
      43             : 
      44             : #include "roken.h"
      45             : 
      46             : #include "normalize_table.h"
      47             : 
      48             : static int
      49      538200 : translation_cmp(const void *key, const void *data)
      50             : {
      51      538200 :     const struct translation *t1 = (const struct translation *)key;
      52      538200 :     const struct translation *t2 = (const struct translation *)data;
      53             : 
      54      538200 :     return t1->key - t2->key;
      55             : }
      56             : 
      57             : enum { s_base  = 0xAC00};
      58             : enum { s_count = 11172};
      59             : enum { l_base  = 0x1100};
      60             : enum { l_count = 19};
      61             : enum { v_base  = 0x1161};
      62             : enum { v_count = 21};
      63             : enum { t_base  = 0x11A7};
      64             : enum { t_count = 28};
      65             : enum { n_count = v_count * t_count};
      66             : 
      67             : static int
      68       41400 : hangul_decomp(const uint32_t *in, size_t in_len,
      69             :               uint32_t *out, size_t *out_len)
      70             : {
      71       41400 :     uint32_t u = *in;
      72             :     unsigned s_index;
      73             :     unsigned l, v, t;
      74             :     unsigned o;
      75             : 
      76       41400 :     if (u < s_base || u >= s_base + s_count)
      77       41400 :         return 0;
      78           0 :     s_index = u - s_base;
      79           0 :     l = l_base + s_index / n_count;
      80           0 :     v = v_base + (s_index % n_count) / t_count;
      81           0 :     t = t_base + s_index % t_count;
      82           0 :     o = 2;
      83           0 :     if (t != t_base)
      84           0 :         ++o;
      85           0 :     if (*out_len < o)
      86           0 :         return WIND_ERR_OVERRUN;
      87           0 :     out[0] = l;
      88           0 :     out[1] = v;
      89           0 :     if (t != t_base)
      90           0 :         out[2] = t;
      91           0 :     *out_len = o;
      92           0 :     return 1;
      93             : }
      94             : 
      95             : static uint32_t
      96       38880 : hangul_composition(const uint32_t *in, size_t in_len)
      97             : {
      98       38880 :     if (in_len < 2)
      99           0 :         return 0;
     100       38880 :     if (in[0] >= l_base && in[0] < l_base + l_count) {
     101           0 :         unsigned l_index = in[0] - l_base;
     102             :         unsigned v_index;
     103             : 
     104           0 :         if (in[1] < v_base || in[1] >= v_base + v_count)
     105           0 :             return 0;
     106           0 :         v_index = in[1] - v_base;
     107           0 :         return (l_index * v_count + v_index) * t_count + s_base;
     108       38880 :     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
     109           0 :         unsigned s_index = in[0] - s_base;
     110             :         unsigned t_index;
     111             : 
     112           0 :         if (s_index % t_count != 0)
     113           0 :             return 0;
     114           0 :         if (in[1] < t_base || in[1] >= t_base + t_count)
     115           0 :             return 0;
     116           0 :         t_index = in[1] - t_base;
     117           0 :         return in[0] + t_index;
     118             :     }
     119       38880 :     return 0;
     120             : }
     121             : 
     122             : static int
     123        2520 : compat_decomp(const uint32_t *in, size_t in_len,
     124             :               uint32_t *out, size_t *out_len)
     125             : {
     126             :     unsigned i;
     127        2520 :     unsigned o = 0;
     128             : 
     129       87840 :     for (i = 0; i < in_len; ++i) {
     130       41400 :         struct translation ts = {in[i]};
     131       41400 :         size_t sub_len = *out_len - o;
     132             :         int ret;
     133             : 
     134       41400 :         ret = hangul_decomp(in + i, in_len - i,
     135       41400 :                             out + o, &sub_len);
     136       41400 :         if (ret) {
     137           0 :             if (ret == WIND_ERR_OVERRUN)
     138           0 :                 return ret;
     139           0 :             o += sub_len;
     140             :         } else {
     141       41400 :             void *s = bsearch(&ts,
     142             :                               _wind_normalize_table,
     143             :                               _wind_normalize_table_size,
     144             :                               sizeof(_wind_normalize_table[0]),
     145             :                               translation_cmp);
     146       41400 :             if (s != NULL) {
     147           0 :                 const struct translation *t = (const struct translation *)s;
     148             : 
     149           0 :                 ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
     150           0 :                                     t->val_len,
     151           0 :                                     out + o, &sub_len);
     152           0 :                 if (ret)
     153           0 :                     return ret;
     154           0 :                 o += sub_len;
     155             :             } else {
     156       41400 :                 if (o >= *out_len)
     157           0 :                     return WIND_ERR_OVERRUN;
     158       41400 :                 out[o++] = in[i];
     159             : 
     160             :             }
     161             :         }
     162             :     }
     163        2520 :     *out_len = o;
     164        2520 :     return 0;
     165             : }
     166             : 
     167             : static void
     168           0 : swap_char(uint32_t * a, uint32_t * b)
     169             : {
     170             :     uint32_t t;
     171           0 :     t = *a;
     172           0 :     *a = *b;
     173           0 :     *b = t;
     174           0 : }
     175             : 
     176             : /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
     177             :  * that all have Canonical_Combining_Class > 0 */
     178             : static void
     179           0 : canonical_reorder_sequence(uint32_t * a, size_t len)
     180             : {
     181             :     size_t i, j;
     182             : 
     183           0 :     if (len <= 1)
     184           0 :         return;
     185             : 
     186           0 :     for (i = 1; i < len; i++) {
     187           0 :         for (j = i;
     188           0 :              j > 0 &&
     189           0 :                  _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
     190           0 :              j--)
     191           0 :             swap_char(&a[j], &a[j-1]);
     192             :     }
     193             : }
     194             : 
     195             : static void
     196        2520 : canonical_reorder(uint32_t *tmp, size_t tmp_len)
     197             : {
     198             :     size_t i;
     199             : 
     200       43920 :     for (i = 0; i < tmp_len; ++i) {
     201       41400 :         int cc = _wind_combining_class(tmp[i]);
     202       41400 :         if (cc) {
     203             :             size_t j;
     204           0 :             for (j = i + 1;
     205           0 :                  j < tmp_len && _wind_combining_class(tmp[j]);
     206           0 :                  ++j)
     207             :                 ;
     208           0 :             canonical_reorder_sequence(&tmp[i], j - i);
     209           0 :             i = j;
     210             :         }
     211             :     }
     212        2520 : }
     213             : 
     214             : static uint32_t
     215       38880 : find_composition(const uint32_t *in, unsigned in_len)
     216             : {
     217       38880 :     unsigned short canon_index = 0;
     218             :     uint32_t cur;
     219       38880 :     unsigned n = 0;
     220             : 
     221       38880 :     cur = hangul_composition(in, in_len);
     222       38880 :     if (cur)
     223           0 :         return cur;
     224             : 
     225             :     do {
     226      295560 :         const struct canon_node *c = &_wind_canon_table[canon_index];
     227             :         unsigned i;
     228             : 
     229      295560 :         if (n % 5 == 0) {
     230       73800 :             cur = *in++;
     231       73800 :             if (in_len-- == 0)
     232           0 :                 return c->val;
     233             :         }
     234             : 
     235      295560 :         i = cur >> 16;
     236      295560 :         if (i < c->next_start || i >= c->next_end)
     237       38880 :             canon_index = 0;
     238             :         else
     239      256680 :             canon_index =
     240      256680 :                 _wind_canon_next_table[c->next_offset + i - c->next_start];
     241      295560 :         if (canon_index != 0) {
     242      256680 :             cur = (cur << 4) & 0xFFFFF;
     243      256680 :             ++n;
     244             :         }
     245      295560 :     } while (canon_index != 0);
     246       38880 :     return 0;
     247             : }
     248             : 
     249             : static int
     250        2520 : combine(const uint32_t *in, size_t in_len,
     251             :         uint32_t *out, size_t *out_len)
     252             : {
     253             :     unsigned i;
     254             :     int ostarter;
     255        2520 :     unsigned o = 0;
     256             :     int old_cc;
     257             : 
     258       46440 :     for (i = 0; i < in_len;) {
     259       82800 :         while (i < in_len && _wind_combining_class(in[i]) != 0) {
     260           0 :             out[o++] = in[i++];
     261             :         }
     262       41400 :         if (i < in_len) {
     263       41400 :             if (o >= *out_len)
     264           0 :                 return WIND_ERR_OVERRUN;
     265       41400 :             ostarter = o;
     266       41400 :             out[o++] = in[i++];
     267       41400 :             old_cc   = -1;
     268             : 
     269       82800 :             while (i < in_len) {
     270             :                 uint32_t comb;
     271             :                 uint32_t v[2];
     272             :                 int cc;
     273             : 
     274       38880 :                 v[0] = out[ostarter];
     275       38880 :                 v[1] = in[i];
     276             : 
     277       38880 :                 cc = _wind_combining_class(in[i]);
     278       38880 :                 if (old_cc != cc && (comb = find_composition(v, 2))) {
     279           0 :                     out[ostarter] = comb;
     280       38880 :                 } else if (cc == 0) {
     281       38880 :                     break;
     282             :                 } else {
     283           0 :                     if (o >= *out_len)
     284           0 :                         return WIND_ERR_OVERRUN;
     285           0 :                     out[o++] = in[i];
     286           0 :                     old_cc   = cc;
     287             :                 }
     288           0 :                 ++i;
     289             :             }
     290             :         }
     291             :     }
     292        2520 :     *out_len = o;
     293        2520 :     return 0;
     294             : }
     295             : 
     296             : int
     297        2520 : _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
     298             :                            uint32_t *out, size_t *out_len)
     299             : {
     300             :     size_t tmp_len;
     301             :     uint32_t *tmp;
     302             :     int ret;
     303             : 
     304        2520 :     if (in_len == 0) {
     305           0 :         *out_len = 0;
     306           0 :         return 0;
     307             :     }
     308             : 
     309        2520 :     tmp_len = in_len * 4;
     310        2520 :     if (tmp_len < MAX_LENGTH_CANON)
     311         360 :         tmp_len = MAX_LENGTH_CANON;
     312        2520 :     tmp = malloc(tmp_len * sizeof(uint32_t));
     313        2520 :     if (tmp == NULL)
     314           0 :         return ENOMEM;
     315             : 
     316        2520 :     ret = compat_decomp(in, in_len, tmp, &tmp_len);
     317        2520 :     if (ret) {
     318           0 :         free(tmp);
     319           0 :         return ret;
     320             :     }
     321        2520 :     canonical_reorder(tmp, tmp_len);
     322        2520 :     ret = combine(tmp, tmp_len, out, out_len);
     323        2520 :     free(tmp);
     324        2520 :     return ret;
     325             : }

Generated by: LCOV version 1.13