···11+/*
22+This file is part of Darling.
33+44+Copyright (C) 2017 Lubos Dolezel
55+66+Darling is free software: you can redistribute it and/or modify
77+it under the terms of the GNU General Public License as published by
88+the Free Software Foundation, either version 3 of the License, or
99+(at your option) any later version.
1010+1111+Darling is distributed in the hope that it will be useful,
1212+but WITHOUT ANY WARRANTY; without even the implied warranty of
1313+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414+GNU General Public License for more details.
1515+1616+You should have received a copy of the GNU General Public License
1717+along with Darling. If not, see <http://www.gnu.org/licenses/>.
1818+*/
1919+2020+#include "TextCommon.h"
2121+2222+TextEncoding CreateTextEncoding(TextEncodingBase encodingBase, TextEncodingVariant encodingVariant, TextEncodingFormat encodingFormat)
2323+{
2424+ TextEncoding rv = encodingBase & 0xff;
2525+ rv |= (encodingVariant << 8) & 0xff00;
2626+ rv |= (encodingFormat << 16) & 0xff0000;
2727+ return rv;
2828+}
2929+3030+3131+
+75
src/CoreServices/TextCommon.h
···11+/*
22+This file is part of Darling.
33+44+Copyright (C) 2017 Lubos Dolezel
55+66+Darling is free software: you can redistribute it and/or modify
77+it under the terms of the GNU General Public License as published by
88+the Free Software Foundation, either version 3 of the License, or
99+(at your option) any later version.
1010+1111+Darling is distributed in the hope that it will be useful,
1212+but WITHOUT ANY WARRANTY; without even the implied warranty of
1313+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414+GNU General Public License for more details.
1515+1616+You should have received a copy of the GNU General Public License
1717+along with Darling. If not, see <http://www.gnu.org/licenses/>.
1818+*/
1919+2020+#ifndef _TEXTCOMMON_H
2121+#define _TEXTCOMMON_H
2222+#include <MacTypes.h>
2323+2424+#ifdef __cplusplus
2525+extern "C" {
2626+#endif
2727+2828+typedef UInt8* TextPtr;
2929+typedef const UInt8* ConstTextPtr;
/* Encoding base values (TextEncodingBase). */
enum {
	kTextEncodingUnicodeDefault = 0x0100	/* Unicode */
};
/* Encoding variant values (TextEncodingVariant) that apply to any base. */
enum {
	kTextEncodingDefaultVariant = 0x0
};
/*
 * Encoding format values (TextEncodingFormat).
 * The kUnicode* constants select the serialization of Unicode text.
 */
enum {
	kTextEncodingDefaultFormat	= 0,

	kUnicodeUTF16Format		= 0,
	kUnicodeUTF7Format		= 1,
	kUnicodeUTF8Format		= 2,
	kUnicodeUTF32Format		= 3,
	kUnicodeUTF16BEFormat		= 4,
	kUnicodeUTF16LEFormat		= 5,
	kUnicodeUTF32BEFormat		= 6,
	kUnicodeUTF32LEFormat		= 7,
	kUnicodeSCSUFormat		= 8,

	/* Alternative names for the UTF-16 and UTF-32 formats. */
	kUnicode16BitFormat		= kUnicodeUTF16Format,
	kUnicode32BitFormat		= kUnicodeUTF32Format
};
/*
 * Encoding variant values (TextEncodingVariant) for the Unicode base.
 * Note that the values are not in numeric order: form D is 5, form C is 3.
 */
enum {
	kUnicodeNoSubset		= 0,
	kUnicodeNormalizationFormD	= 5,
	kUnicodeNormalizationFormC	= 3,
	kUnicodeCanonicalCompVariant	= kUnicodeNormalizationFormC,	/* same value as form C */
	kUnicodeHFSPlusDecompVariant	= 8,
	kUnicodeHFSPlusCompVariant	= 9
};
6262+6363+typedef UInt32 TextEncoding;
6464+typedef UInt32 TextEncodingBase;
6565+typedef UInt32 TextEncodingFormat;
6666+typedef UInt32 TextEncodingVariant;
6767+6868+TextEncoding CreateTextEncoding(TextEncodingBase encodingBase, TextEncodingVariant encodingVariant, TextEncodingFormat encodingFormat);
6969+7070+#ifdef __cplusplus
7171+}
7272+#endif
7373+7474+#endif
7575+
+233
src/CoreServices/TextEncodingConverter.cpp
···11+/*
22+This file is part of Darling.
33+44+Copyright (C) 2017 Lubos Dolezel
55+66+Darling is free software: you can redistribute it and/or modify
77+it under the terms of the GNU General Public License as published by
88+the Free Software Foundation, either version 3 of the License, or
99+(at your option) any later version.
1010+1111+Darling is distributed in the hope that it will be useful,
1212+but WITHOUT ANY WARRANTY; without even the implied warranty of
1313+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414+GNU General Public License for more details.
1515+1616+You should have received a copy of the GNU General Public License
1717+along with Darling. If not, see <http://www.gnu.org/licenses/>.
1818+*/
1919+2020+#include "TextEncodingConverter.h"
2121+#include <unicode/ucnv.h>
2222+#include <unicode/normalizer2.h>
2323+#include "MacErrors.h"
2424+2525+struct OpaqueTECObjectRef
2626+{
2727+ UConverter* inputConverter;
2828+ UConverter* outputConverter;
2929+ const UNormalizer2* normalizer;
3030+ UChar buffer[64];
3131+ size_t bufferUsed;
3232+};
3333+3434+static UConverter* createConverter(TextEncodingBase base, TextEncodingFormat format)
3535+{
3636+ switch (base)
3737+ {
3838+ case kTextEncodingUnicodeDefault:
3939+ {
4040+ const char* enc;
4141+ UErrorCode error = U_ZERO_ERROR;
4242+4343+ switch (format)
4444+ {
4545+ case kUnicodeUTF16Format:
4646+ enc = "UTF-16";
4747+ break;
4848+ case kUnicodeUTF7Format:
4949+ enc = "UTF-7";
5050+ break;
5151+ case kUnicodeUTF8Format:
5252+ enc = "UTF-8";
5353+ break;
5454+ case kUnicodeUTF32Format:
5555+ enc = "UTF-32";
5656+ break;
5757+ case kUnicodeUTF16BEFormat:
5858+ enc = "UTF-16BE";
5959+ break;
6060+ case kUnicodeUTF16LEFormat:
6161+ enc = "UTF-16LE";
6262+ break;
6363+ case kUnicodeUTF32BEFormat:
6464+ enc = "UTF-32BE";
6565+ break;
6666+ case kUnicodeUTF32LEFormat:
6767+ enc = "UTF-32LE";
6868+ break;
6969+ default:
7070+ return NULL;
7171+ }
7272+7373+ return ucnv_open(enc, &error);
7474+ }
7575+ default:
7676+ return NULL;
7777+ }
7878+}
7979+8080+OSStatus TECCreateConverter(TECObjectRef *newEncodingConverter, TextEncoding inputEncoding, TextEncoding outputEncoding)
8181+{
8282+ TextEncodingFormat format;
8383+ TextEncodingBase base;
8484+ TextEncodingVariant variant;
8585+ OpaqueTECObjectRef* obj = new OpaqueTECObjectRef;
8686+8787+ obj->inputConverter = obj->outputConverter = NULL;
8888+ obj->normalizer = NULL;
8989+ obj->bufferUsed = 0;
9090+9191+ format = (inputEncoding >> 16) & 0xff;
9292+ base = (inputEncoding >> 0) & 0xff;
9393+ variant = (inputEncoding >> 8) & 0xff;
9494+9595+ obj->inputConverter = createConverter(base, format);
9696+ if (!obj->inputConverter)
9797+ {
9898+ TECDisposeConverter(obj);
9999+ *newEncodingConverter = NULL;
100100+ return unimpErr;
101101+ }
102102+103103+ format = (outputEncoding >> 16) & 0xff;
104104+ base = (outputEncoding >> 0) & 0xff;
105105+ variant = (outputEncoding >> 8) & 0xff;
106106+107107+ obj->outputConverter = createConverter(base, format);
108108+ if (!obj->outputConverter)
109109+ {
110110+ TECDisposeConverter(obj);
111111+ *newEncodingConverter = NULL;
112112+ return unimpErr;
113113+ }
114114+115115+ if (base == kTextEncodingUnicodeDefault)
116116+ {
117117+ switch (variant)
118118+ {
119119+ case kUnicodeNoSubset:
120120+ break;
121121+ case kUnicodeNormalizationFormD:
122122+ {
123123+ UErrorCode error = U_ZERO_ERROR;
124124+ obj->normalizer = unorm2_getNFDInstance(&error);
125125+ break;
126126+ }
127127+ case kUnicodeNormalizationFormC:
128128+ {
129129+ UErrorCode error = U_ZERO_ERROR;
130130+ obj->normalizer = unorm2_getNFCInstance(&error);
131131+ break;
132132+ }
133133+ case kUnicodeHFSPlusDecompVariant:
134134+ case kUnicodeHFSPlusCompVariant:
135135+ {
136136+ UErrorCode error = U_ZERO_ERROR;
137137+ obj->normalizer = unorm2_getNFKDInstance(&error);
138138+ break;
139139+ }
140140+ }
141141+ }
142142+143143+ *newEncodingConverter = obj;
144144+ return noErr;
145145+}
146146+147147+OSStatus TECConvertText(TECObjectRef encodingConverter, ConstTextPtr inputBuffer,
148148+ ByteCount inputBufferLength, ByteCount *actualInputLength,
149149+ TextPtr outputBuffer, ByteCount outputBufferLength, ByteCount *actualOutputLength)
150150+{
151151+ if (actualInputLength != NULL)
152152+ *actualInputLength = 0;
153153+ *actualOutputLength = 0;
154154+155155+ while (outputBufferLength > 0)
156156+ {
157157+ if (encodingConverter->bufferUsed > 0)
158158+ {
159159+ // flush buffer
160160+ UErrorCode error = U_ZERO_ERROR;
161161+ char* target = (char*) outputBuffer;
162162+ const UChar* source = encodingConverter->buffer;
163163+ ByteCount inputUsed;
164164+165165+166166+ ucnv_fromUnicode(encodingConverter->outputConverter,
167167+ &target, target + outputBufferLength,
168168+ &source, source + encodingConverter->bufferUsed,
169169+ NULL, false, &error);
170170+171171+ if (error != U_ZERO_ERROR && error != U_BUFFER_OVERFLOW_ERROR)
172172+ return paramErr;
173173+174174+ *actualOutputLength = target - ((char*)outputBuffer);
175175+176176+ inputUsed = source - encodingConverter->buffer;
177177+ if (inputUsed < encodingConverter->bufferUsed)
178178+ {
179179+ memmove(encodingConverter->buffer,
180180+ encodingConverter->buffer + inputUsed,
181181+ encodingConverter->bufferUsed - inputUsed);
182182+ }
183183+ encodingConverter->bufferUsed -= inputUsed;
184184+ outputBuffer += *actualOutputLength;
185185+ outputBufferLength -= *actualOutputLength;
186186+187187+ if (error == U_BUFFER_OVERFLOW_ERROR)
188188+ break;
189189+ }
190190+191191+ if (inputBufferLength <= 0)
192192+ break;
193193+194194+ // Consume input
195195+ {
196196+ UChar* target = encodingConverter->buffer + encodingConverter->bufferUsed;
197197+ const char* source = (const char*) inputBuffer;
198198+ UErrorCode error = U_ZERO_ERROR;
199199+200200+ ucnv_toUnicode(encodingConverter->inputConverter,
201201+ &target, encodingConverter->buffer + (sizeof(encodingConverter->buffer) / sizeof(encodingConverter->buffer[0])),
202202+ &source, source + inputBufferLength,
203203+ NULL, false, &error);
204204+205205+ if (error != U_ZERO_ERROR && error != U_BUFFER_OVERFLOW_ERROR)
206206+ return paramErr;
207207+208208+ encodingConverter->bufferUsed += target - (encodingConverter->buffer + encodingConverter->bufferUsed);
209209+ *actualInputLength += source - ((const char*)inputBuffer);
210210+ inputBufferLength -= source - ((const char*)inputBuffer);
211211+ inputBuffer = (ConstTextPtr) source;
212212+ }
213213+214214+ // TODO: normalize
215215+ // Normalization may cause the data to no longer fit into our internal buffer :-/
216216+ }
217217+218218+ return noErr;
219219+}
220220+221221+OSStatus TECFlushText(TECObjectRef encodingConverter, TextPtr outputBuffer, ByteCount outputBufferLength, ByteCount *actualOutputLength)
222222+{
223223+ return TECConvertText(encodingConverter, NULL, 0, NULL, outputBuffer, outputBufferLength, actualOutputLength);
224224+}
225225+226226+OSStatus TECDisposeConverter(TECObjectRef conv)
227227+{
228228+ ucnv_close(conv->inputConverter);
229229+ ucnv_close(conv->outputConverter);
230230+ delete conv;
231231+ return noErr;
232232+}
233233+
+44
src/CoreServices/TextEncodingConverter.h
···11+/*
22+This file is part of Darling.
33+44+Copyright (C) 2017 Lubos Dolezel
55+66+Darling is free software: you can redistribute it and/or modify
77+it under the terms of the GNU General Public License as published by
88+the Free Software Foundation, either version 3 of the License, or
99+(at your option) any later version.
1010+1111+Darling is distributed in the hope that it will be useful,
1212+but WITHOUT ANY WARRANTY; without even the implied warranty of
1313+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414+GNU General Public License for more details.
1515+1616+You should have received a copy of the GNU General Public License
1717+along with Darling. If not, see <http://www.gnu.org/licenses/>.
1818+*/
1919+2020+#ifndef _TEXTENCODINGCONVERTER_H
2121+#define _TEXTENCODINGCONVERTER_H
2222+#include <MacTypes.h>
2323+#include "TextCommon.h"
2424+2525+#ifdef __cplusplus
2626+extern "C" {
2727+#endif
/* Opaque handle to a text encoding converter; the struct is defined by the implementation. */
typedef struct OpaqueTECObjectRef *TECObjectRef;
3030+3131+OSStatus TECCreateConverter(TECObjectRef *newEncodingConverter, TextEncoding inputEncoding, TextEncoding outputEncoding);
3232+3333+OSStatus TECConvertText(TECObjectRef encodingConverter, ConstTextPtr inputBuffer, ByteCount inputBufferLength, ByteCount *actualInputLength, TextPtr outputBuffer, ByteCount outputBufferLength, ByteCount *actualOutputLength);
3434+3535+OSStatus TECFlushText(TECObjectRef encodingConverter, TextPtr outputBuffer, ByteCount outputBufferLength, ByteCount *actualOutputLength);
3636+3737+OSStatus TECDisposeConverter(TECObjectRef newEncodingConverter);
3838+3939+#ifdef __cplusplus
4040+}
4141+#endif
4242+4343+#endif
4444+