Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 : Samba charset module for Mac OS X/Darwin
4 : Copyright (C) Benjamin Riefenstahl 2003
5 :
6 : This program is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU General Public License as published by
8 : the Free Software Foundation; either version 3 of the License, or
9 : (at your option) any later version.
10 :
11 : This program is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : GNU General Public License for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with this program. If not, see <http://www.gnu.org/licenses/>.
18 : */
19 :
20 : /*
21 : * modules/charset_macosxfs.c
22 : *
23 : * A Samba charset module to use on Mac OS X/Darwin as the filesystem
24 : * and display encoding.
25 : *
26 : * Actually two implementations are provided here. The default
27 : * implementation is based on the official CFString API. The other is
28 : * based on internal CFString APIs as defined in the OpenDarwin
29 : * source.
30 : */
31 :
32 : #include "replace.h"
33 : #include "charset.h"
34 : #include "charset_proto.h"
35 : #undef realloc
36 :
37 : #ifdef DARWINOS
38 :
39 : /*
40 : * Include OS frameworks. These are only needed in this module.
41 : */
42 : #include <CoreFoundation/CFString.h>
43 :
44 : /*
45 : * See if autoconf has found us the internal headers in some form.
46 : */
47 : #if defined(HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H)
48 : # include <CoreFoundation/CFStringEncodingConverter.h>
49 : # include <CoreFoundation/CFUnicodePrecomposition.h>
50 : # define USE_INTERNAL_API 1
51 : #elif defined(HAVE_CFSTRINGENCODINGCONVERTER_H)
52 : # include <CFStringEncodingConverter.h>
53 : # include <CFUnicodePrecomposition.h>
54 : # define USE_INTERNAL_API 1
55 : #endif
56 :
57 : /*
58 : * Compile time configuration: Do we want debug output?
59 : */
60 : /* #define DEBUG_STRINGS 1 */
61 :
62 : /*
63 : * A simple, but efficient memory provider for our buffers.
64 : */
65 : static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
66 : {
67 : if (newsize > *size) {
68 : *size = newsize + 128;
69 : buffer = realloc(buffer, *size);
70 : }
71 : return buffer;
72 : }
73 :
74 : /*
75 : * While there is a version of OpenDarwin for intel, the usual case is
76 : * big-endian PPC. So we need byte swapping to handle the
77 : * little-endian byte order of the network protocol. We also need an
78 : * additional dynamic buffer to do this work for incoming data blocks,
79 : * because we have to consider the original data as constant.
80 : *
81 : * We abstract the differences away by providing a simple facade with
82 : * these functions/macros:
83 : *
84 : * le_to_native(dst,src,len)
85 : * native_to_le(cp,len)
86 : * set_ucbuffer_with_le(buffer,bufsize,data,size)
87 : * set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
88 : */
89 : #ifdef WORDS_BIGENDIAN
90 :
/*
 * Copy len bytes from src to dst, exchanging the two bytes of every
 * complete 16-bit unit on the way.
 *
 * If len is odd, the single trailing byte is copied unchanged, so that
 * neither buffer is accessed beyond len bytes.  The buffers must not
 * overlap.
 */
static inline void swap_bytes (char * dst, const char * src, size_t len)
{
	size_t i;

	for (i = 0; len - i >= 2; i += 2) {
		dst[i] = src[i + 1];
		dst[i + 1] = src[i];
	}
	if (i < len) {
		/* Incomplete last unit: there is nothing to swap it with. */
		dst[i] = src[i];
	}
}
/*
 * Exchange the two bytes of every complete 16-bit unit in the first
 * len bytes of cp, in place.
 *
 * If len is odd, the single trailing byte is left untouched, so that
 * the buffer is never accessed beyond len bytes.
 */
static inline void swap_bytes_inplace (char * cp, size_t len)
{
	size_t i;

	for (i = 0; len - i >= 2; i += 2) {
		char first = cp[i];
		cp[i] = cp[i + 1];
		cp[i + 1] = first;
	}
}
112 :
113 : #define le_to_native(dst,src,len) swap_bytes(dst,src,len)
114 : #define native_to_le(cp,len) swap_bytes_inplace(cp,len)
115 : #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
116 : set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
117 :
118 : #else /* ! WORDS_BIGENDIAN */
119 :
120 : #define le_to_native(dst,src,len) memcpy(dst,src,len)
121 : #define native_to_le(cp,len) /* nothing */
122 : #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
123 : (((void)(bufsize)),(UniChar*)(data))
124 :
125 : #endif
126 :
127 : static inline UniChar *set_ucbuffer_with_le_copy (
128 : UniChar *buffer, size_t *bufsize,
129 : const void *data, size_t size, size_t reserve)
130 : {
131 : buffer = resize_buffer(buffer, bufsize, size+reserve);
132 : le_to_native((char*)buffer,data,size);
133 : return buffer;
134 : }
135 :
136 :
137 : /*
138 : * A simple hexdump function for debugging error conditions.
139 : */
140 : #define debug_out(s) DEBUG(0,(s))
141 :
142 : #ifdef DEBUG_STRINGS
143 :
/*
 * Write a labelled hex dump of len bytes at s to the debug log.
 *
 * Each output line shows the offset, up to 16 bytes in hex (in two
 * groups of eight) and the same bytes as ASCII, with every
 * non-printable byte shown as '.'.
 *
 * The label and the dumped data are always passed to DEBUG as
 * arguments of a "%s" format, never as the format string itself:
 * the data may legitimately contain '%' characters.
 */
static void hexdump( const char * label, const char * s, size_t len )
{
	size_t offset = 0;

	debug_out("<<<<<<<\n");
	DEBUG(0, ("%s\n", label));
	while (offset < len) {
		/* Worst case is 78 characters plus the terminating NUL. */
		char line[100];
		char *d = line;
		char * const end = line + sizeof(line);
		size_t rowlen = len - offset;
		size_t i;

		if (rowlen > 16) {
			rowlen = 16;
		}

		d += snprintf(d, (size_t)(end - d), "%04X  ", (unsigned)offset);

		/* Hex columns; missing bytes are padded to keep the
		 * ASCII column aligned. */
		for (i = 0; i < 16; ++i) {
			if (i < rowlen) {
				d += snprintf(d, (size_t)(end - d), "%02X ",
					      ((unsigned)s[offset + i]) & 0xFF);
			} else {
				d += snprintf(d, (size_t)(end - d), "   ");
			}
			if (7 == i || 15 == i) {
				*d++ = ' ';
			}
		}

		/* ASCII column. */
		for (i = 0; i < rowlen; ++i) {
			unsigned char c = (unsigned char)s[offset + i];
			*d++ = (c >= ' ' && c < 0x7F) ? (char)c : '.';
		}
		*d++ = '\n';
		*d = 0;

		DEBUG(0, ("%s", line));
		offset += rowlen;
	}
	debug_out(">>>>>>>\n");
}
185 :
186 : #else /* !DEBUG_STRINGS */
187 :
188 : #define hexdump(label,s,len) /* nothing */
189 :
190 : #endif
191 :
192 :
193 : #if !USE_INTERNAL_API
194 :
195 : /*
196 : * An implementation based on documented Mac OS X APIs.
197 : *
198 : * This does a certain amount of memory management, creating and
199 : * manipulating CFString objects. We try to minimize the impact by
200 : * keeping those objects around and re-using them. We also use
201 : * external backing store for the CFStrings where this is possible and
202 : * beneficial.
203 : *
204 : * The Unicode normalizations forms available at this level are
205 : * generic, not specifically for the file system. So they may not be
206 : * perfect fits.
207 : */
208 : size_t macosxfs_encoding_pull(
209 : void *cd, /* Encoder handle */
210 : const char **inbuf, size_t *inbytesleft, /* Script string */
211 : char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */
212 : {
213 : static const int script_code = kCFStringEncodingUTF8;
214 : static CFMutableStringRef cfstring = NULL;
215 : size_t outsize;
216 : CFRange range;
217 :
218 : (void) cd; /* UNUSED */
219 :
220 : if (0 == *inbytesleft) {
221 : return 0;
222 : }
223 :
224 : if (NULL == cfstring) {
225 : /*
226 : * A version with an external backing store as in the
227 : * push function should have been more efficient, but
228 : * testing shows, that it is actually slower (!).
229 : * Maybe kCFAllocatorDefault gets shortcut evaluation
230 : * internally, while kCFAllocatorNull doesn't.
231 : */
232 : cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
233 : }
234 :
235 : /*
236 : * Three methods of appending to a CFString, choose the most
237 : * efficient.
238 : */
239 : if (0 == (*inbuf)[*inbytesleft-1]) {
240 : CFStringAppendCString(cfstring, *inbuf, script_code);
241 : } else if (*inbytesleft <= 255) {
242 : Str255 buffer;
243 : buffer[0] = *inbytesleft;
244 : memcpy(buffer+1, *inbuf, buffer[0]);
245 : CFStringAppendPascalString(cfstring, buffer, script_code);
246 : } else {
247 : /*
248 : * We would like to use a fixed buffer and a loop
249 : * here, but than we can't garantee that the input is
250 : * well-formed UTF-8, as we are supposed to do.
251 : */
252 : static char *buffer = NULL;
253 : static size_t buflen = 0;
254 : buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
255 : memcpy(buffer, *inbuf, *inbytesleft);
256 : buffer[*inbytesleft] = 0;
257 : CFStringAppendCString(cfstring, *inbuf, script_code);
258 : }
259 :
260 : /*
261 : * Compose characters, using the non-canonical composition
262 : * form.
263 : */
264 : CFStringNormalize(cfstring, kCFStringNormalizationFormC);
265 :
266 : outsize = CFStringGetLength(cfstring);
267 : range = CFRangeMake(0,outsize);
268 :
269 : if (outsize == 0) {
270 : /*
271 : * HACK: smbd/mangle_hash2.c:is_legal_name() expects
272 : * errors here. That function will always pass 2
273 : * characters. smbd/open.c:check_for_pipe() cuts a
274 : * patchname to 10 characters blindly. Suppress the
275 : * debug output in those cases.
276 : */
277 : if(2 != *inbytesleft && 10 != *inbytesleft) {
278 : debug_out("String conversion: "
279 : "An unknown error occurred\n");
280 : hexdump("UTF8->UTF16LE (old) input",
281 : *inbuf, *inbytesleft);
282 : }
283 : errno = EILSEQ; /* Not sure, but this is what we have
284 : * actually seen. */
285 : return -1;
286 : }
287 : if (outsize*2 > *outbytesleft) {
288 : CFStringDelete(cfstring, range);
289 : debug_out("String conversion: "
290 : "Output buffer too small\n");
291 : hexdump("UTF8->UTF16LE (old) input",
292 : *inbuf, *inbytesleft);
293 : errno = E2BIG;
294 : return -1;
295 : }
296 :
297 : CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
298 : CFStringDelete(cfstring, range);
299 :
300 : native_to_le(*outbuf, outsize*2);
301 :
302 : /*
303 : * Add a converted null byte, if the CFString conversions
304 : * prevented that until now.
305 : */
306 : if (0 == (*inbuf)[*inbytesleft-1] &&
307 : (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
308 :
309 : if ((outsize*2+2) > *outbytesleft) {
310 : debug_out("String conversion: "
311 : "Output buffer too small\n");
312 : hexdump("UTF8->UTF16LE (old) input",
313 : *inbuf, *inbytesleft);
314 : errno = E2BIG;
315 : return -1;
316 : }
317 :
318 : (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
319 : outsize += 2;
320 : }
321 :
322 : *inbuf += *inbytesleft;
323 : *inbytesleft = 0;
324 : *outbuf += outsize*2;
325 : *outbytesleft -= outsize*2;
326 :
327 : return 0;
328 : }
329 :
330 : size_t macosxfs_encoding_push(
331 : void *cd, /* Encoder handle */
332 : const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
333 : char **outbuf, size_t *outbytesleft) /* Script string */
334 : {
335 : static const int script_code = kCFStringEncodingUTF8;
336 : static CFMutableStringRef cfstring = NULL;
337 : static UniChar *buffer = NULL;
338 : static size_t buflen = 0;
339 : CFIndex outsize, cfsize, charsconverted;
340 :
341 : (void) cd; /* UNUSED */
342 :
343 : if (0 == *inbytesleft) {
344 : return 0;
345 : }
346 :
347 : /*
348 : * We need a buffer that can hold 4 times the original data,
349 : * because that is the theoretical maximum that decomposition
350 : * can create currently (in Unicode 4.0).
351 : */
352 : buffer = set_ucbuffer_with_le_copy(
353 : buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
354 :
355 : if (NULL == cfstring) {
356 : cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
357 : kCFAllocatorDefault,
358 : buffer, *inbytesleft/2, buflen/2,
359 : kCFAllocatorNull);
360 : } else {
361 : CFStringSetExternalCharactersNoCopy(
362 : cfstring,
363 : buffer, *inbytesleft/2, buflen/2);
364 : }
365 :
366 : /*
367 : * Decompose characters, using the non-canonical decomposition
368 : * form.
369 : *
370 : * NB: This isn't exactly what HFS+ wants (see note on
371 : * kCFStringEncodingUseHFSPlusCanonical in
372 : * CFStringEncodingConverter.h), but AFAIK it's the best that
373 : * the official API can do.
374 : */
375 : CFStringNormalize(cfstring, kCFStringNormalizationFormD);
376 :
377 : cfsize = CFStringGetLength(cfstring);
378 : charsconverted = CFStringGetBytes(
379 : cfstring, CFRangeMake(0,cfsize),
380 : script_code, 0, false,
381 : *outbuf, *outbytesleft, &outsize);
382 :
383 : if (0 == charsconverted) {
384 : debug_out("String conversion: "
385 : "Buffer too small or not convertable\n");
386 : hexdump("UTF16LE->UTF8 (old) input",
387 : *inbuf, *inbytesleft);
388 : errno = EILSEQ; /* Probably more likely. */
389 : return -1;
390 : }
391 :
392 : /*
393 : * Add a converted null byte, if the CFString conversions
394 : * prevented that until now.
395 : */
396 : if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
397 : (0 != (*outbuf)[outsize-1])) {
398 :
399 : if (((size_t)outsize+1) > *outbytesleft) {
400 : debug_out("String conversion: "
401 : "Output buffer too small\n");
402 : hexdump("UTF16LE->UTF8 (old) input",
403 : *inbuf, *inbytesleft);
404 : errno = E2BIG;
405 : return -1;
406 : }
407 :
408 : (*outbuf)[outsize] = 0;
409 : ++outsize;
410 : }
411 :
412 : *inbuf += *inbytesleft;
413 : *inbytesleft = 0;
414 : *outbuf += outsize;
415 : *outbytesleft -= outsize;
416 :
417 : return 0;
418 : }
419 :
420 : #else /* USE_INTERNAL_API */
421 :
422 : /*
423 : * An implementation based on internal code as known from the
424 : * OpenDarwin CVS.
425 : *
426 : * This code doesn't need much memory management because it uses
427 : * functions that operate on the raw memory directly.
428 : *
429 : * The push routine here is faster and more compatible with HFS+ than
430 : * the other implementation above. The pull routine is only faster
431 : * for some strings, slightly slower for others. The pull routine
432 : * loses because it has to iterate over the data twice, once to
433 : * decode UTF-8 and then to do the character composition required by
434 : * Windows.
435 : */
436 : static size_t macosxfs_encoding_pull(
437 : void *cd, /* Encoder handle */
438 : const char **inbuf, size_t *inbytesleft, /* Script string */
439 : char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */
440 : {
441 : static const int script_code = kCFStringEncodingUTF8;
442 : UInt32 srcCharsUsed = 0;
443 : UInt32 dstCharsUsed = 0;
444 : UInt32 result;
445 : uint32_t dstDecomposedUsed = 0;
446 : uint32_t dstPrecomposedUsed = 0;
447 :
448 : (void) cd; /* UNUSED */
449 :
450 : if (0 == *inbytesleft) {
451 : return 0;
452 : }
453 :
454 : result = CFStringEncodingBytesToUnicode(
455 : script_code, kCFStringEncodingComposeCombinings,
456 : *inbuf, *inbytesleft, &srcCharsUsed,
457 : (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
458 :
459 : switch(result) {
460 : case kCFStringEncodingConversionSuccess:
461 : if (*inbytesleft == srcCharsUsed) {
462 : break;
463 : }
464 :
465 : FALL_THROUGH;
466 : case kCFStringEncodingInsufficientOutputBufferLength:
467 : debug_out("String conversion: "
468 : "Output buffer too small\n");
469 : hexdump("UTF8->UTF16LE (new) input",
470 : *inbuf, *inbytesleft);
471 : errno = E2BIG;
472 : return -1;
473 : case kCFStringEncodingInvalidInputStream:
474 : /*
475 : * HACK: smbd/mangle_hash2.c:is_legal_name() expects
476 : * errors here. That function will always pass 2
477 : * characters. smbd/open.c:check_for_pipe() cuts a
478 : * patchname to 10 characters blindly. Suppress the
479 : * debug output in those cases.
480 : */
481 : if(2 != *inbytesleft && 10 != *inbytesleft) {
482 : debug_out("String conversion: "
483 : "Invalid input sequence\n");
484 : hexdump("UTF8->UTF16LE (new) input",
485 : *inbuf, *inbytesleft);
486 : }
487 : errno = EILSEQ;
488 : return -1;
489 : case kCFStringEncodingConverterUnavailable:
490 : debug_out("String conversion: "
491 : "Unknown encoding\n");
492 : hexdump("UTF8->UTF16LE (new) input",
493 : *inbuf, *inbytesleft);
494 : errno = EINVAL;
495 : return -1;
496 : }
497 :
498 : /*
499 : * It doesn't look like CFStringEncodingBytesToUnicode() can
500 : * produce precomposed characters (flags=ComposeCombinings
501 : * doesn't do it), so we need another pass over the data here.
502 : * We can do this in-place, as the string can only get
503 : * shorter.
504 : *
505 : * (Actually in theory there should be an internal
506 : * decomposition and reordering before the actual composition
507 : * step. But we should be able to rely on that we always get
508 : * fully decomposed strings for input, so this can't create
509 : * problems in reality.)
510 : */
511 : CFUniCharPrecompose(
512 : (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
513 : (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
514 :
515 : native_to_le(*outbuf, dstPrecomposedUsed*2);
516 :
517 : *inbuf += srcCharsUsed;
518 : *inbytesleft -= srcCharsUsed;
519 : *outbuf += dstPrecomposedUsed*2;
520 : *outbytesleft -= dstPrecomposedUsed*2;
521 :
522 : return 0;
523 : }
524 :
525 : static size_t macosxfs_encoding_push(
526 : void *cd, /* Encoder handle */
527 : const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
528 : char **outbuf, size_t *outbytesleft) /* Script string */
529 : {
530 : static const int script_code = kCFStringEncodingUTF8;
531 : static UniChar *buffer = NULL;
532 : static size_t buflen = 0;
533 : UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
534 :
535 : (void) cd; /* UNUSED */
536 :
537 : if (0 == *inbytesleft) {
538 : return 0;
539 : }
540 :
541 : buffer = set_ucbuffer_with_le(
542 : buffer, &buflen, *inbuf, *inbytesleft);
543 :
544 : result = CFStringEncodingUnicodeToBytes(
545 : script_code, kCFStringEncodingUseHFSPlusCanonical,
546 : buffer, *inbytesleft/2, &srcCharsUsed,
547 : *outbuf, *outbytesleft, &dstCharsUsed);
548 :
549 : switch(result) {
550 : case kCFStringEncodingConversionSuccess:
551 : if (*inbytesleft/2 == srcCharsUsed) {
552 : break;
553 : }
554 :
555 : FALL_THROUGH;
556 : case kCFStringEncodingInsufficientOutputBufferLength:
557 : debug_out("String conversion: "
558 : "Output buffer too small\n");
559 : hexdump("UTF16LE->UTF8 (new) input",
560 : *inbuf, *inbytesleft);
561 : errno = E2BIG;
562 : return -1;
563 : case kCFStringEncodingInvalidInputStream:
564 : /*
565 : * HACK: smbd/open.c:check_for_pipe():is_legal_name()
566 : * cuts a pathname to 10 characters blindly. Suppress
567 : * the debug output in those cases.
568 : */
569 : if(10 != *inbytesleft) {
570 : debug_out("String conversion: "
571 : "Invalid input sequence\n");
572 : hexdump("UTF16LE->UTF8 (new) input",
573 : *inbuf, *inbytesleft);
574 : }
575 : errno = EILSEQ;
576 : return -1;
577 : case kCFStringEncodingConverterUnavailable:
578 : debug_out("String conversion: "
579 : "Unknown encoding\n");
580 : hexdump("UTF16LE->UTF8 (new) input",
581 : *inbuf, *inbytesleft);
582 : errno = EINVAL;
583 : return -1;
584 : }
585 :
586 : *inbuf += srcCharsUsed*2;
587 : *inbytesleft -= srcCharsUsed*2;
588 : *outbuf += dstCharsUsed;
589 : *outbytesleft -= dstCharsUsed;
590 :
591 : return 0;
592 : }
593 :
594 : #endif /* USE_INTERNAL_API */
595 :
596 : #else /* DARWIN */
597 :
/*
 * Stand-in that is compiled when DARWINOS is not defined.  It takes no
 * arguments, does nothing and returns nothing.
 * NOTE(review): presumably it only exists so that this file still
 * defines a symbol on other platforms -- confirm against the build.
 */
void charset_macosfs_dummy(void);

void charset_macosfs_dummy(void)
{
	/* Intentionally empty. */
}
603 :
604 : #endif /* DARWIN */
|