Skip to content

Commit 0d048a2

Browse files
committed
Handle multi-byte single-value encodings in conversion
- fixes the problem described in bug #684
1 parent d360b01 commit 0d048a2

3 files changed

Lines changed: 28 additions & 0 deletions

File tree

dcmdata/include/dcmtk/dcmdata/dcspchrs.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,15 @@ class DCMTK_DCMDATA_EXPORT DcmSpecificCharacterSet
232232
*/
233233
static size_t countCharactersInUTF8String(const OFString &utf8String);
234234

235+
/** check whether the given Specific Character Set value belongs to a multi-byte
236+
* character set that is only allowed as a single value
237+
* in SpecificCharacterSet and that may contain bytes that look like ASCII (e.g.
238+
* with the highest bit cleared), but are part of a multi-byte non-ASCII character.
239+
* This currently includes only Chinese encodings, because in UTF-8 (ISO_IR 192) any byte
240+
* belonging to a non-ASCII character has the highest bit set.
241+
*/
242+
static OFBool isNonAsciiConformMultiByteSingleValueCharacterSet(const OFString& charset);
243+
235244

236245
protected:
237246

dcmdata/libsrc/dcspchrs.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,10 +587,20 @@ OFCondition DcmSpecificCharacterSet::convertStringWithoutCodeExtensions(const ch
587587
size_t pos = 0;
588588
const char *firstChar = fromString;
589589
const char *currentChar = fromString;
590+
const bool isMultiByte = isNonAsciiConformMultiByteSingleValueCharacterSet(SourceCharacterSet);
591+
590592
// iterate over all characters of the string (as long as there is no error)
591593
while ((pos < fromLength) && status.good())
592594
{
593595
const char c0 = *currentChar++;
596+
if (isMultiByte && (c0 & 0x80) != 0)
597+
{
598+
// this is the lead byte of a 2-byte character, or of the first or second
599+
// byte pair of a 4-byte character - skip the byte that follows it
600+
currentChar++;
601+
pos += 2;
602+
continue;
603+
}
594604
// check for characters HT, LF, FF, CR or any other specified delimiter
595605
const OFBool isDelimiter = ((c0 == '\011') || (c0 == '\012') || (c0 == '\014') || (c0 == '\015') ||
596606
(delimiters.find(c0) != OFString_npos));
@@ -936,3 +946,8 @@ size_t DcmSpecificCharacterSet::countCharactersInUTF8String(const OFString &utf8
936946
// just call the appropriate function from the underlying class
937947
return OFCharacterEncoding::countCharactersInUTF8String(utf8String);
938948
}
949+
950+
/* Report whether the given Specific Character Set value names one of the
 * Chinese multi-byte encodings (GBK or GB18030), i.e. an encoding in which
 * a byte that looks like ASCII may be part of a multi-byte character.
 */
OFBool DcmSpecificCharacterSet::isNonAsciiConformMultiByteSingleValueCharacterSet(const OFString& charset)
{
    // compare the value against each of the supported encoding names in turn
    if (charset == "GBK")
        return true;
    return charset == "GB18030";
}

dcmdata/tests/tspchrs.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ OFTEST(dcmdata_specificCharacterSet_3)
155155
OFCHECK(converter.selectCharacterSet("GB18030").good());
156156
OFCHECK(converter.convertString("Wang^XiaoDong=\315\365^\320\241\266\253=", resultStr, delimiters).good());
157157
OFCHECK_EQUAL(resultStr, "Wang^XiaoDong=\347\216\213^\345\260\217\344\270\234=");
158+
// check whether a byte looking like a delimiter inside a multi-byte character is not handled as delimiter
159+
// 0x5c is the byte for a backslash in single-byte encodings, but here it is the trailing byte of each of the two Kanji characters
160+
OFCHECK(converter.convertString("Noriwa=\x81\x5c\x82\x5c", resultStr, delimiters).good());
161+
OFCHECK_EQUAL(resultStr, "Noriwa=\xe4\xb9\x97\xe4\xbf\x93");
158162
// check whether string conversion from Chinese language to UTF-8 works
159163
// example taken from DICOM PS 3.5 Annex K.2
160164
OFCHECK(converter.selectCharacterSet("\\ISO 2022 IR 58").good());

0 commit comments

Comments
 (0)