File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -232,6 +232,15 @@ class DCMTK_DCMDATA_EXPORT DcmSpecificCharacterSet
232232 */
233233 static size_t countCharactersInUTF8String (const OFString &utf8String);
234234
235+ /** check whether the given Specific Character Set value belongs to a multi-byte
236+ * character set that is only allowed as a single value
237+ * in SpecificCharacterSet and that may contain bytes that look like ASCII (i.e.
238+ * with the highest bit cleared), but are part of a multi-byte non-ASCII character.
239+ * This currently includes only the Chinese encodings GBK and GB18030; in UTF-8
240+ * (ISO_IR 192), any byte belonging to a non-ASCII character has the highest bit set.
241+ */
242+ static OFBool isNonAsciiConformMultiByteSingleValueCharacterSet (const OFString& charset);
243+
235244
236245 protected:
237246
Original file line number Diff line number Diff line change @@ -587,10 +587,20 @@ OFCondition DcmSpecificCharacterSet::convertStringWithoutCodeExtensions(const ch
587587 size_t pos = 0 ;
588588 const char *firstChar = fromString;
589589 const char *currentChar = fromString;
590+ const bool isMultiByte = isNonAsciiConformMultiByteSingleValueCharacterSet (SourceCharacterSet);
591+
590592 // iterate over all characters of the string (as long as there is no error)
591593 while ((pos < fromLength) && status.good ())
592594 {
593595 const char c0 = *currentChar++;
596+ if (isMultiByte && (c0 & 0x80 ) != 0 )
597+ {
598+ // this is a 2-byte character or the first or second part
599+ // of a 4-byte character - skip the next byte
600+ currentChar++;
601+ pos += 2 ;
602+ continue ;
603+ }
594604 // check for characters HT, LF, FF, CR or any other specified delimiter
595605 const OFBool isDelimiter = ((c0 == ' \011 ' ) || (c0 == ' \012 ' ) || (c0 == ' \014 ' ) || (c0 == ' \015 ' ) ||
596606 (delimiters.find (c0) != OFString_npos));
@@ -936,3 +946,8 @@ size_t DcmSpecificCharacterSet::countCharactersInUTF8String(const OFString &utf8
936946 // just call the appropriate function from the underlying class
937947 return OFCharacterEncoding::countCharactersInUTF8String (utf8String);
938948}
949+
950+ OFBool DcmSpecificCharacterSet::isNonAsciiConformMultiByteSingleValueCharacterSet (const OFString& charset)
951+ {
952+ return charset == " GBK" || charset == " GB18030" ;
953+ }
Original file line number Diff line number Diff line change @@ -155,6 +155,10 @@ OFTEST(dcmdata_specificCharacterSet_3)
155155 OFCHECK (converter.selectCharacterSet (" GB18030" ).good ());
156156 OFCHECK (converter.convertString (" Wang^XiaoDong=\315\365 ^\320\241\266\253 =" , resultStr, delimiters).good ());
157157 OFCHECK_EQUAL (resultStr, " Wang^XiaoDong=\347\216\213 ^\345\260\217\344\270\234 =" );
158+ // check that a byte that looks like a delimiter but is part of a multi-byte character is not handled as a delimiter
159+ // 0x5c is the byte for a backslash in single-byte encodings, but here it is the second byte of two 2-byte characters
160+ OFCHECK (converter.convertString (" Noriwa=\x81\x5c\x82\x5c " , resultStr, delimiters).good ());
161+ OFCHECK_EQUAL (resultStr, " Noriwa=\xe4\xb9\x97\xe4\xbf\x93 " );
158162 // check whether string conversion from Chinese language to UTF-8 works
159163 // example taken from DICOM PS 3.5 Annex K.2
160164 OFCHECK (converter.selectCharacterSet (" \\ ISO 2022 IR 58" ).good ());
You can’t perform that action at this time.
0 commit comments