Handle multi-byte single-value encodings in conversion

mrbean-bremen · mrbean-bremen · commit 0d048a20f1e2 · 2025-12-19T08:38:43.000+01:00
- fixes the problem described in bug #684
diff --git a/dcmdata/include/dcmtk/dcmdata/dcspchrs.h b/dcmdata/include/dcmtk/dcmdata/dcspchrs.h
@@ -232,6 +232,15 @@ class DCMTK_DCMDATA_EXPORT DcmSpecificCharacterSet
      */
     static size_t countCharactersInUTF8String(const OFString &utf8String);
 
+    /** check whether the given Specific Character Set value denotes a multi-byte character
+     * set that is only allowed as a single value and whose non-ASCII characters may contain
+     * bytes that look like ASCII (i.e. have the highest bit cleared). Currently, only Chinese
+     * encodings qualify; in UTF-8 (ISO_IR 192) every byte of a non-ASCII character has it set.
+     * @param charset Specific Character Set value to be checked
+     * @return true if the given value denotes such a character set, false otherwise
+     */
+    static OFBool isNonAsciiConformMultiByteSingleValueCharacterSet(const OFString& charset);
+
 
   protected:
 
diff --git a/dcmdata/libsrc/dcspchrs.cc b/dcmdata/libsrc/dcspchrs.cc
@@ -587,10 +587,20 @@ OFCondition DcmSpecificCharacterSet::convertStringWithoutCodeExtensions(const ch
         size_t pos = 0;
         const char *firstChar = fromString;
         const char *currentChar = fromString;
+        const bool isMultiByte = isNonAsciiConformMultiByteSingleValueCharacterSet(SourceCharacterSet);
+
         // iterate over all characters of the string (as long as there is no error)
         while ((pos < fromLength) && status.good())
         {
             const char c0 = *currentChar++;
+            if (isMultiByte && (c0 & 0x80) != 0)
+            {
+                // this is a 2-byte character or the first or second part
+                // of a 4-byte character - skip the next byte
+                currentChar++;
+                pos += 2;
+                continue;
+            }
             // check for characters HT, LF, FF, CR or any other specified delimiter
             const OFBool isDelimiter =  ((c0 == '\011') || (c0 == '\012') || (c0 == '\014') || (c0 == '\015') ||
                 (delimiters.find(c0) != OFString_npos));
@@ -936,3 +946,8 @@ size_t DcmSpecificCharacterSet::countCharactersInUTF8String(const OFString &utf8
     // just call the appropriate function from the underlying class
     return OFCharacterEncoding::countCharactersInUTF8String(utf8String);
 }
+
+OFBool DcmSpecificCharacterSet::isNonAsciiConformMultiByteSingleValueCharacterSet(const OFString& charset)
+{ // reports whether charset equals one of the two Chinese multi-byte encodings listed below
+    return charset == "GBK" || charset == "GB18030";
+}
diff --git a/dcmdata/tests/tspchrs.cc b/dcmdata/tests/tspchrs.cc
@@ -155,6 +155,10 @@ OFTEST(dcmdata_specificCharacterSet_3)
         OFCHECK(converter.selectCharacterSet("GB18030").good());
         OFCHECK(converter.convertString("Wang^XiaoDong=\315\365^\320\241\266\253=", resultStr, delimiters).good());
         OFCHECK_EQUAL(resultStr, "Wang^XiaoDong=\347\216\213^\345\260\217\344\270\234=");
+        // check that a byte looking like a delimiter inside a multi-byte character is not handled as a delimiter:
+        // 0x5c is the backslash in single-byte encodings, but here it is the trail byte of two Chinese characters
+        OFCHECK(converter.convertString("Noriwa=\x81\x5c\x82\x5c", resultStr, delimiters).good());
+        OFCHECK_EQUAL(resultStr, "Noriwa=\xe4\xb9\x97\xe4\xbf\x93");
         // check whether string conversion from Chinese language to UTF-8 works
         // example taken from DICOM PS 3.5 Annex K.2
         OFCHECK(converter.selectCharacterSet("\\ISO 2022 IR 58").good());